In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK data used further down is available locally: the
# punkt / punkt_tab tokenizer models and the stopwords corpus.
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# Read the tab-separated train and test splits from the mounted Drive folder.
TRAIN_TSV_PATH = "/content/drive/MyDrive/Unstop dataset/train.tsv"
TEST_TSV_PATH = "/content/drive/MyDrive/Unstop dataset/test.tsv"

train_df = pd.read_csv(TRAIN_TSV_PATH, sep='\t')
test_df = pd.read_csv(TEST_TSV_PATH, sep='\t')

In [4]:
# Preview the first five training rows (head() returns five rows by default).
train_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,2619,Ex-CIA head says Trump remarks on Russia inter...,Former CIA director John Brennan on Friday cri...,politicsNews,"July 22, 2017",1
1,16043,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,How did this man come to OWN this store? There...,Government News,"Jun 19, 2017",0
2,876,Federal Reserve governor Powell's policy views...,President Donald Trump on Thursday tapped Fede...,politicsNews,"November 2, 2017",1
3,19963,SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...,Hillary Clinton ally David Brock is offering t...,left-news,"Sep 17, 2016",0
4,10783,NANCY PELOSI ARROGANTLY DISMISSES Questions on...,Pleading ignorance is a perfect ploy for Nancy...,politics,"May 26, 2017",0


In [5]:
# Preview the first five test rows (head() returns five rows by default).
test_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,8104,Conservatives Will HATE What Donald Trump Just...,Donald Trump isn t exactly a stranger to makin...,News,"February 14, 2016",0
1,7467,Trump victory may create new tension between U...,Donald Trump’s U.S. election victory may creat...,politicsNews,"November 9, 2016",1
2,9473,WATCH: Hundreds of ILLEGAL ALIENS Storm Senate...,A couple of quick questions come to mind when ...,politics,"Nov 9, 2017",0
3,276,"Democratic Senator Franken to resign: CNN, cit...",U.S. Democratic Senator Al Franken will announ...,politicsNews,"December 7, 2017",1
4,19274,GANG OF DOMESTIC TERRORISTS Violently Attack L...,***WARNING*** Violence is graphic***This Trump...,left-news,"Jan 21, 2017",0


In [6]:
# Preprocessing function
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a set, reading the corpus only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = set(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text):
    """Normalize one piece of raw text into a space-joined string of tokens.

    Steps: lowercase, delete every character that is not a-z or whitespace
    (digits and punctuation are removed, not replaced by a space), tokenize
    with NLTK's word_tokenize, and drop English stop words.

    Parameters
    ----------
    text : str or scalar
        Raw text. A missing value (None / NaN / pd.NA) is treated as empty
        text; any other non-string scalar is converted with str().

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        # Empty cells in the TSV are read by pandas as NaN (a float), which
        # has no .lower(); map missing values to an empty document instead
        # of raising.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()
    # Remove non-alphabetic characters and digits
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords; the set is cached so it is not rebuilt for every row
    stop_words = _english_stop_words()
    return ' '.join(word for word in tokens if word not in stop_words)

In [7]:
# Clean the raw title and text of every training row and store the results
# in new processed_title / processed_text columns.
for raw_column in ('title', 'text'):
    train_df[f'processed_{raw_column}'] = train_df[raw_column].apply(preprocess_text)

In [8]:
# Same cleaning for the test split: processed_title / processed_text columns.
for raw_column in ('title', 'text'):
    test_df[f'processed_{raw_column}'] = test_df[raw_column].apply(preprocess_text)

In [9]:
# Build one document per article for feature extraction:
# cleaned title, a single space, then the cleaned body text.
for frame in (train_df, test_df):
    title_part, body_part = frame['processed_title'], frame['processed_text']
    frame['combined'] = title_part + " " + body_part

In [10]:
# Encode the target labels as integer class ids. The encoder is fitted on the
# training labels only and then reused, unchanged, for the test labels.
# NOTE(review): the Fake=0 / Real=1 reading comes from the original comment;
# the previewed rows already carry 0/1 labels — confirm the mapping.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])
train_df['label'] = label_encoder.transform(train_df['label'])
test_df['label'] = label_encoder.transform(test_df['label'])

In [11]:
# Turn the combined documents into TF-IDF features. The vectorizer (capped at
# 5000 features) is fitted on the training documents only; the test documents
# are transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features if needed
train_documents, test_documents = train_df['combined'], test_df['combined']
X_train_tfidf = vectorizer.fit_transform(train_documents)
X_test_tfidf = vectorizer.transform(test_documents)

In [12]:
# Fit a logistic-regression classifier (max_iter=1000) on the training
# TF-IDF features against the encoded training labels.
model = LogisticRegression(max_iter=1000)
y_train = train_df['label']
model.fit(X_train_tfidf, y_train)

In [13]:
# Hard class predictions for every row of the test feature matrix.
y_pred = model.predict(X=X_test_tfidf)

In [14]:
# Score the test-set predictions. Accuracy, precision, recall and F1 compare
# the hard predicted labels with the true ones; AUC-ROC instead uses column 1
# of predict_proba as the score.
y_true = test_df['label']
positive_class_scores = model.predict_proba(X_test_tfidf)[:, 1]

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auc_roc = roc_auc_score(y_true, positive_class_scores)

In [15]:
# Report each metric on its own line, formatted to four decimal places.
metric_report = (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
    ('AUC-ROC', auc_roc),
)
for metric_name, metric_value in metric_report:
    print(f'{metric_name}: {metric_value:.4f}')

Accuracy: 0.9819
Precision: 0.9778
Recall: 0.9847
F1 Score: 0.9812
AUC-ROC: 0.9984


In [16]:
# Accuracy expressed as a percentage, at full float precision.
accuracy_percent = accuracy * 100
print("Accuracy: ", accuracy_percent)

Accuracy:  98.18555703399056


In [17]:
# Confusion matrix of true test labels against predicted labels,
# printed under a heading line.
conf_matrix = confusion_matrix(test_df['label'], y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

Confusion Matrix:
[[4195   89]
 [  61 3922]]


In [18]:
# Collect the predictions in the required format: one [title, predicted label]
# pair per test row, in row order.
output = [
    [title, predicted_label]
    for title, predicted_label in zip(test_df['title'], y_pred)
]

In [23]:
# Save the results to a text file: one line per test article, written to
# result.txt in the working directory as ["<title>", <predicted label>].
# Titles contain non-ASCII punctuation (e.g. curly quotes/apostrophes), so the
# encoding is pinned to UTF-8 instead of relying on the platform default.
# NOTE(review): a title that itself contains a double quote is written
# unescaped — confirm the required output format tolerates that.
with open("result.txt", "w", encoding="utf-8") as f:
    f.writelines(f'["{row[0]}", {row[1]}]\n' for row in output)

In [24]:
# Save the results to a text file on the mounted Drive folder: one line per
# test article, formatted as ["<title>", <predicted label>].
# Titles contain non-ASCII punctuation (e.g. curly quotes/apostrophes), so the
# encoding is pinned to UTF-8 instead of relying on the platform default.
# NOTE(review): a title that itself contains a double quote is written
# unescaped — confirm the required output format tolerates that.
output_file_path = "/content/drive/MyDrive/Unstop dataset/result.txt"
with open(output_file_path, "w", encoding="utf-8") as f:
    f.writelines(f'["{row[0]}", {row[1]}]\n' for row in output)

print(f"Results saved to {output_file_path}")


Results saved to /content/drive/MyDrive/Unstop dataset/result.txt
