In [1]:
pip install gensim


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split  
data = pd.read_csv("news.csv")

In [3]:
print(data.head())  # View the first few rows


   Unnamed: 0                                              title  \
0        8476                       You Can Smell Hillary’s Fear   
1       10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2        3608        Kerry to go to Paris in gesture of sympathy   
3       10142  Bernie supporters on Twitter erupt in anger ag...   
4         875   The Battle of New York: Why This Primary Matters   

                                                text label  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
4  It's primary day in New York and front-runners...  REAL  


In [4]:
import string
punct = set(string.punctuation)

In [5]:
def clean_text(text):
  text = text.lower()  # Lowercase
  text = ''.join([char for char in text if char not in punct])  # Remove punctuation
  # You can add stop word removal, tokenization, stemming/lemmatization here
  return text

In [6]:
data["text"] = data["text"].apply(clean_text)
if "Label" in data.columns:  # Check if a label column exists (assuming binary classification)
  X_train, X_test, y_train, y_test = train_test_split(data["Text"], data["Label"], test_size=0.2, random_state=42)

In [7]:
import pandas as pd

# ... (other code)

# Check for missing values
print(data.isnull().sum())

# Handle missing values (choose one approach):
# Option 1: Remove rows with missing values
data = data.dropna()

# Option 2: Impute missing values (e.g., with a default value)
data["text"] = data["text"].fillna("NA")  # Replace with appropriate value

# ... (rest of your code)


Unnamed: 0    0
title         0
text          0
label         0
dtype: int64


In [8]:
import pandas as pd

# Assuming your data is loaded into a pandas DataFrame named 'data'
# with a column named 'Label' containing the labels (REAL or FAKE)

# Define a dictionary for label mapping
label_map = {"FAKE": 0, "REAL": 1}

# Apply the mapping to the 'Label' column using replace
data['label'] = data['label'].replace(label_map)

# Now the 'Label' column has 0 for FAKE and 1 for REAL


In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer  # Feature extraction
from sklearn.ensemble import RandomForestClassifier  # Random Forest model
from sklearn.model_selection import train_test_split  # Train-test split
from sklearn.metrics import accuracy_score, precision_score, recall_score  # Evaluation metrics

# Assuming your data is loaded into a pandas DataFrame named 'data'
# with columns 'text' (text of the news article) and 'Label' (REAL or FAKE label)

# 1. Feature Extraction (TF-IDF):
vectorizer = TfidfVectorizer(max_features=2000)  # Extract 2000 most informative features
features = vectorizer.fit_transform(data["text"])

# 2. Train-Test Split:
X_train, X_test, y_train, y_test = train_test_split(features, data["label"], test_size=0.2, random_state=42)

# 3. Model Training (Random Forest):
random_forest_model = RandomForestClassifier(n_estimators=100)  # Initialize Random Forest model
random_forest_model.fit(X_train, y_train)

# 4. Model Prediction on Test Set:
y_pred = random_forest_model.predict(X_test)

# 5. Model Evaluation:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

# Now you have a trained Random Forest model and can use it to predict on new unseen data


Accuracy: 0.9139700078926598
Precision: 0.9233226837060703
Recall: 0.9045383411580594


In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer  # Feature extraction
from sklearn.model_selection import train_test_split  # Train-test split
from sklearn.metrics import accuracy_score, precision_score, recall_score  # Evaluation metrics

def evaluate_model(model, X_test, y_test):
  """
  Evaluates the performance of the trained model using accuracy, precision, and recall.

  Args:
      model: The trained model object (assuming labels are already numerical).
      X_test: The TF-IDF features of the testing data.
      y_test: The true labels (numerical, e.g., 0 for FAKE, 1 for REAL) of the testing data.

  Prints the evaluation metrics.
  """
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("Recall:", recall)

def improve_model(model, features, labels, model_type):
  """
  Provides a framework for potential model improvement strategies.

  Args:
      model: The trained model object.
      features: The TF-IDF features from the data.
      labels: The true labels (numerical) for the news articles.
      model_type: The type of model used (e.g., "logistic_regression", "random_forest", "svm").

  This function is a guide and needs to be implemented based on the chosen model_type.
  """
  # Example hyperparameter tuning using GridSearchCV (adapt parameters for each model)
  if model_type == "logistic_regression":
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear']
    }
  elif model_type == "random_forest":
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 8]
    }
  elif model_type == "svm":
    param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    }
  else:
    raise ValueError("Invalid model type. Choose from logistic_regression, random_forest, or svm.")

  grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
  grid_search.fit(features, labels)  # Added closing parenthesis here
  print("Best hyperparameters:", grid_search.best_params_)

  # Example feature engineering (replace with your chosen techniques)
  # You can explore techniques like sentiment analysis, named entity recognition, etc.
  # ... (implementation based on your chosen feature engineering approach)
  # ...


In [11]:
from scipy import sparse  # Assuming features is a SciPy sparse matrix

# Access the text data directly (modify if stored differently)
text_data = features[:, 0].toarray()  # This might be incorrect in your case

# Check if this provides the expected text content (print or explore further)
print(text_data)

# ... your existing code using the text_data for sentiment analysis

# Optionally, consider alternative feature extraction methods
# that preserve the original text content (e.g., TF-IDF on raw text)


[[0.        ]
 [0.        ]
 [0.        ]
 ...
 [0.        ]
 [0.01950994]
 [0.        ]]


In [15]:
# Assuming text_data is a single element (not iterable)
sentiment_features = []  # Empty list to store sentiment features

# You can't iterate through a single element using a for loop

# ... your existing code using sentiment_features (modify if necessary)


In [13]:
for feature_vector in text_data:
    word_list = ...  # Your code to convert feature_vector to word_list
    print(word_list)
    # ... rest of the loop


Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
Ellipsis
E

In [14]:
sentiment_features = []
for feature_vector in text_data:
    # Assuming feature_vector is a NumPy array containing encoded features
    text_embedding = feature_vector.mean(axis=0)  # Average across all features
    # Use text_embedding for sentiment analysis
    sentiment_features.append(...)  # Replace with sentiment analysis logic


In [16]:
import gensim.downloader as api

# Download a pre-trained word embedding model (adjust model name)
model = api.load('word2vec-google-news-300')

# Assuming text_data contains encoded features (e.g., TF-IDF vectors)
sentiment_features = []
for feature_vector in text_data:
    # Convert encoded vector to a list of words (assuming vocabulary mapping)
    word_list = ...  # Replace with code to convert encoded vector to word list

    # Calculate average word embedding for the text (modify if necessary)
    text_embedding = sum(model.wv[word] for word in word_list) / len(word_list)

    # Use text_embedding for sentiment analysis with appropriate models
    # (e.g., train a separate sentiment classifier on text embeddings)
    sentiment_features.append(...)  # Replace with sentiment analysis logic

# ... your existing code using sentiment_features


TypeError: 'ellipsis' object is not iterable