<a href="https://colab.research.google.com/github/anishathakrar/Identifying_Misinformation_IR/blob/main/Identifying_Misinformation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**CS6200 - Information Retrieval** \
**Anisha Thakrar**

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
%cd /content/drive/MyDrive/Colab Notebooks/CS6200

/content/drive/MyDrive/Colab Notebooks/CS6200


In [28]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')

from nltk import corpus
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Reading train and test files**

In [11]:
train_df = pd.read_csv('training.csv')
train_df.head()

Unnamed: 0,Id,title,text,subject,date,label
0,13970,WATCH: Paul Manafort Gets DESTROYED On CNN Fo...,"Paul Manafort, the chair of Donald Trump s cam...",News,"August 14, 2016",1
1,41668,REFUGEE BUSINESS IS CASH COW FOR LUTHERAN CHAR...,How very charitable of the Lutheran and Cath...,left-news,"May 8, 2015",1
2,26810,Trump’s Excuse For All His Sexist Statements ...,In case you ve been living under a rock for yo...,News,"March 28, 2016",1
3,30967,'Jihadi Gran' gets 10 years after joining son ...,PARIS (Reuters) - A court on Friday sentenced ...,worldnews,"October 6, 2017",0
4,26072,New Czech government to seek confidence vote o...,PRAGUE (Reuters) - The Czech government will s...,worldnews,"December 13, 2017",0


In [10]:
# Records in training data
len(train_df)

31428

In [14]:
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,Id,title,text,subject,date
0,22216,Five killed in sectarian attack in Pakistan,"QUETTA, Pakistan (Reuters) - (This October 9 s...",worldnews,"October 9, 2017"
1,27917,White House narrows search to three for Suprem...,"WASHINGTON/AUSTIN, Texas (Reuters) - The White...",politicsNews,"March 11, 2016"
2,25007,THE DEATH OF P.C. POLICE? Trump’s DOJ Makes Ma...,The Justice Department on Thursday dropped the...,politics,"Jun 30, 2017"
3,1377,"House, Senate Republicans face challenge over ...",WASHINGTON (Reuters) - U.S. Republicans in Con...,politicsNews,"December 5, 2017"
4,32476,China complains about Taiwan content in U.S. d...,BEIJING (Reuters) - China said on Thursday it ...,worldnews,"December 14, 2017"


In [13]:
# Records in test data
len(test_df)

13470

**Data Cleaning**

In [22]:
# Drop NA values if any
train_df.isna().sum() # no NA values

Id         0
title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [25]:
# Remove stopwords from text 
stop_words = corpus.stopwords.words('english')
def remove_stopwords(text):
    return ' '.join(word.lower() for word in text.split(' ') if word not in stop_words)

train_df.text = train_df.text.apply(remove_stopwords)

In [26]:
# Remove junk characters from text 
train_df.text = train_df.text.str.replace('\d+','')

  train_df.text = train_df.text.str.replace('\d+','')


In [29]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatizing(text):
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split(' ') )

train_df.text = train_df.text.apply(lemmatizing)

In [30]:
# Create TF-IDF
tfidf =TfidfVectorizer(max_features=5000, ngram_range=(1,3))
X = tfidf.fit_transform(train_df.text).toarray()

In [32]:
X.shape

(31428, 5000)

In [33]:
y = train_df.label

**Naive Bayes Classifier**

In [36]:
# train-test split
x_train, x_dev, y_train, y_dev = train_test_split(X, y, train_size=0.25, random_state = 42)

# Fitting the model
classifier = MultinomialNB().fit(x_train, y_train)
preds = classifier.predict(x_dev)
preds[:50]

array([1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0])

In [55]:
# Model evaluation
print("Accuracy:",round(metrics.accuracy_score(y_dev, preds),4))
print("Precision:",round(metrics.precision_score(y_dev, preds),4))
print("Recall:",round(metrics.recall_score(y_dev, preds),4))

Accuracy: 0.9494
Precision: 0.9513
Recall: 0.9526


**Topic Relevance**

In [56]:
# Topics in the dataset
train_df["subject"].value_counts()

politicsNews       7863
worldnews          7088
News               6378
politics           4832
left-news          3096
Government News    1089
US_News             546
Middle-east         536
Name: subject, dtype: int64

In [57]:
# Encoding subject column
train_df['subject'] = train_df['subject'].astype('category')
train_df['subject_code'] = train_df['subject'].cat.codes
train_df.head()

Unnamed: 0,Id,title,text,subject,date,label,subject_code
0,13970,WATCH: Paul Manafort Gets DESTROYED On CNN Fo...,"paul manafort, chair donald trump campaign, go...",News,"August 14, 2016",1,2
1,41668,REFUGEE BUSINESS IS CASH COW FOR LUTHERAN CHAR...,charitable lutheran catholic church bring te...,left-news,"May 8, 2015",1,4
2,26810,Trump’s Excuse For All His Sexist Statements ...,"case living rock entire life, especially campa...",News,"March 28, 2016",1,2
3,30967,'Jihadi Gran' gets 10 years after joining son ...,paris (reuters) - court friday sentenced radic...,worldnews,"October 6, 2017",0,7
4,26072,New Czech government to seek confidence vote o...,prague (reuters) - czech government seek vote ...,worldnews,"December 13, 2017",0,7


In [62]:
# Create TF-IDF
X_multi = tfidf.fit_transform(train_df.text).toarray()
y_multi = train_df.subject_code

# train-test split
x_multi_train,x_multi_dev,y_multi_train,y_multi_dev = train_test_split(X_multi, y_multi, train_size=0.25, random_state = 42)

# Fitting the model with y as subject code
multi_classifier = MultinomialNB().fit(x_multi_train, y_multi_train)
preds_multi = multi_classifier.predict(x_multi_dev)
preds_multi[:50]

array([2, 6, 2, 6, 6, 2, 6, 2, 2, 2, 6, 7, 7, 5, 6, 7, 3, 6, 7, 6, 6, 5,
       6, 7, 2, 2, 6, 2, 5, 6, 5, 2, 6, 7, 6, 5, 6, 5, 6, 5, 5, 2, 6, 6,
       5, 2, 6, 2, 6, 6], dtype=int8)

In [70]:
# Model evaluation
print("Accuracy:",round(metrics.accuracy_score(y_multi_dev, preds_multi),4))
print("Precision:",round(metrics.precision_score(y_multi_dev, preds_multi, average="weighted"),4))
print("Recall:",round(metrics.recall_score(y_multi_dev, preds_multi, average="weighted"),4))

Accuracy: 0.7086
Precision: 0.6447
Recall: 0.7086


**Test Data Preparation** 

In [71]:
# Data cleaning 
test_df = test_df.dropna()
test_df.text = test_df.text.apply(remove_stopwords)
test_df.text = test_df.text.apply(lemmatizing)
test_df.text

0        quetta, pakistan (reuters) - (this october 9 s...
1        washington/austin, texas (reuters) - the white...
2        the justice department thursday dropped legal ...
3        washington (reuters) - u.s. republican congres...
4        beijing (reuters) - china said thursday compla...
                               ...                        
13465    ron reagan, late president reagan son, underst...
13466    a prolific trump foe found viral fame boasting...
13467    washington (reuters) - u.s. president donald t...
13468    beijing (reuters) - a young chinese climbing e...
13469    hillary lost lead swing state oh, nv, ia fl si...
Name: text, Length: 13470, dtype: object

In [72]:
X_test = tfidf.fit_transform(test_df.text).toarray()

In [73]:
X_test.shape

(13470, 5000)

In [74]:
pred_test = classifier.predict(X_test)
pred_test[:50]

array([0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0])

In [76]:
subject_code_preds = multi_classifier.predict(X_test)
subject_code_preds[:50]

array([0, 0, 2, 6, 7, 5, 0, 2, 5, 6, 6, 7, 6, 5, 0, 6, 5, 5, 5, 5, 0, 0,
       5, 6, 0, 7, 0, 4, 5, 5, 5, 5, 0, 2, 5, 6, 6, 4, 0, 6, 0, 5, 0, 4,
       0, 5, 5, 4, 0, 0], dtype=int8)

**Calculate doc-query similarity**

In [89]:
titles = list(train_df.title)
docs = list(train_df.text)
ids = list(train_df.Id)

In [90]:
X = tfidf.fit_transform(docs)
# Convert the X as transposed matrix
X = X.T.toarray()
# Create a DataFrame and set the vocabulary as the index
df = pd.DataFrame(X, index=tfidf.get_feature_names_out())

**Calculate similarity score between query and document**

In [91]:
def get_similar_articles(q, df, n):
  print("query:", q)
  print("search results: ")
  ans_df = pd.DataFrame()
  ans_title = []
  ans_sim = []
  ans_id = []

  # Convert the query become a vector
  q = [q]
  q_vec = tfidf.transform(q).toarray().reshape(df.shape[0],)
  sim = {}
  
  # Calculate the similarity
  for i in range(df.shape[1]):
    sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)
  
  # Sort the values 
  sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
  # Print the articles and their similarity values
  for k, v in sim_sorted:
    if v != 0.0:
      # print("Cosine Similarity:", v)
      ans_sim.append(v)
      # print(titles[k])
      ans_title.append(titles[k])
      ans_id.append(ids[k])
      # print()
  ans_df['id'] = ans_id
  ans_df['title'] = ans_title
  ans_df['sim'] = ans_sim
  return ans_df[:n]

**Basic search engine**

In [92]:
# Add The Query
q1 = 'trump'

# Get 100 similar documents
ans = get_similar_articles(q1, df, 100)
ans

query: trump
search results: 


  sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)


Unnamed: 0,id,title,sim
0,20109,BUSTED: Steel Unions FURIOUS At Trump For Des...,0.345131
1,33088,Leading Senator In Trump-Russia Investigation...,0.341802
2,39856,Trump Just Got DESTROYED By J.K. Rowling For ...,0.319619
3,16178,Greedy Trump Cuts Takes Housing Money Away Fr...,0.319383
4,32036,"Trump, Pence master delicate art of dancing to...",0.290871
...,...,...,...
95,29741,Texas Cop Vows To Brutalize Any Trans Woman W...,0.036180
96,30162,"White House aims to speed U.S. drone, wireless...",0.034320
97,26891,LOL! HYPOCRITE HILLARY Gives Speech On Evils O...,0.033109
98,11709,CONTROVERSY AT SCHOOL After Brave Student Wear...,0.031433


In [93]:
# Get topic of query
def get_query_class(q):
  q = [q]
  q_vec = tfidf.transform(q).toarray()
  subj_class = multi_classifier.predict(q_vec)
  print(subj_class)
  return subj_class

In [94]:
query_topic = get_query_class(q1)

# Get similar documents from predicted label dataset
result_ids = list(ans.id)
ans_code = train_df[[True if i in result_ids else False for i in train_df.Id] ][["Id","subject_code","label"]]
ans_code.head()

[2]


Unnamed: 0,Id,subject_code,label
0,13970,2,1
2,26810,2,1
5,7209,5,1
7,41384,5,1
8,34361,6,0


In [95]:
docs_retreived = pd.merge(ans,ans_code, left_on=["id"], right_on=["Id"])
docs_retreived

Unnamed: 0,id,title,sim,Id,subject_code,label
0,20109,BUSTED: Steel Unions FURIOUS At Trump For Des...,0.345131,20109,2,1
1,33088,Leading Senator In Trump-Russia Investigation...,0.341802,33088,2,1
2,39856,Trump Just Got DESTROYED By J.K. Rowling For ...,0.319619,39856,2,1
3,16178,Greedy Trump Cuts Takes Housing Money Away Fr...,0.319383,16178,2,1
4,32036,"Trump, Pence master delicate art of dancing to...",0.290871,32036,6,0
...,...,...,...,...,...,...
95,29741,Texas Cop Vows To Brutalize Any Trans Woman W...,0.036180,29741,2,1
96,30162,"White House aims to speed U.S. drone, wireless...",0.034320,30162,6,0
97,26891,LOL! HYPOCRITE HILLARY Gives Speech On Evils O...,0.033109,26891,5,1
98,11709,CONTROVERSY AT SCHOOL After Brave Student Wear...,0.031433,11709,5,1


In [97]:
class_labs = list(docs_retreived.label)
fact_filtered = docs_retreived[[True if i == 1 else False for i in class_labs]]

subj_labs = list(docs_retreived.subject_code)
topic_filtered = docs_retreived[[True if i == query_topic else False for i in subj_labs]][["id","title"]]

topic_filtered

Unnamed: 0,id,title
0,20109,BUSTED: Steel Unions FURIOUS At Trump For Des...
1,33088,Leading Senator In Trump-Russia Investigation...
2,39856,Trump Just Got DESTROYED By J.K. Rowling For ...
3,16178,Greedy Trump Cuts Takes Housing Money Away Fr...
5,23254,Gawker Fools Trump Into Quoting Genocidal Man...
7,26810,Trump’s Excuse For All His Sexist Statements ...
8,13970,WATCH: Paul Manafort Gets DESTROYED On CNN Fo...
9,8318,Warren Buffett Knows Exactly Why Trump’s Camp...
10,5086,Billionaire Richard Branson Breaks Silence Ab...
11,32545,"CNN And MSNBC Destroy Trump, Black Out His Fa..."
