In [1]:
import re
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

Task 1

In [2]:
# reading dfs
train_df = pd.read_csv("fake_train.csv")
test_df = pd.read_csv("fake_test.csv")

In [3]:
train_df.head()

Unnamed: 0,text,label
0,Trump administration to review goal of world w...,1
1,Turkish academics to be tried in April over Ku...,1
2,Factbox: Italy's new electoral law offers a mi...,1
3,WATCH: Trump Get His A** Handed To Him By Chr...,0
4,Mexico president says Trump visit could have b...,1


In [4]:
test_df.head()

Unnamed: 0,text,label
0,"As U.S. budget fight looms, Republicans flip t...",1
1,U.S. military to accept transgender recruits o...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,1
3,FBI Russia probe helped by Australian diplomat...,1
4,Trump wants Postal Service to charge 'much mor...,1


In [5]:
# finding total label
train_df["label"].unique()

array([1, 0])

In [6]:
# checking null if any
train_df.isna().sum()

text     0
label    0
dtype: int64

In [7]:
# checking null if any
test_df.isna().sum()

text     0
label    0
dtype: int64

Task 2

1. Keeping alphabets only as numbers and other characters do not contribute directly without context
2. Removing stopwords as it effects the abundance but not directly in word2vec, still removing to shrink the search space
3. To lowercase as it is just modeling the topics and tagging with some numerical value
4. Lemmatization: the core form of words are needed rather than the participle forms to find the relationships in this exercise
5. Doing the same pre-processing to test data too.

In [8]:
def clean_text(sentence: str) -> list:
    """cleaning text by extracting alphabets and
    then splitting into word list for each sentence

    Args:
        sentence (str): sentence string

    Returns:
        list: list of words.
    """
    pattern = re.compile(r"[A-Za-z]+")
    return re.findall(pattern=pattern, string=sentence)

def to_lowercase(word_list: list) -> list:
    """case changing of all contents in each list
    of words

    Args:
        word_list (list): list of words with alphabets only texts

    Returns:
        word_list (list): list of words with lowercase transformation
    """
    return [word.lower() for word in word_list]

def remove_stopwords(list_of_words: list) -> list:
    """removing stop words from list of words by matching
    English stop words and extracting those out 
    
    Args:
        list_of_words (list): list of words with stop words

    Returns:
        list: stop word free list of words
    """
    stopword_list = stopwords.words("english")

    return [word for word in list_of_words if word not in stopword_list]

def extract_word_lemma(word_list: list) -> list:
    """lemmatization of words list from word of list
    using NLTK WordNetLemmatizer class

    Args:
        word_list (list): list of string

    Returns:
        list: lemmatized list of strings
    """
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word, tag[0].lower())
        if tag[0].lower() in ["n", "v", "a", "r", "s"]
        else word
        for word, tag in pos_tag(word_list)
    ]

In [9]:
# keeping alphabets only
train_df["cleaned_corpus"] = train_df["text"].apply(lambda text: clean_text(text) if not pd.isna(text) else "")
# removing stopwords
train_df["cleaned_corpus"] = train_df["cleaned_corpus"].apply(lambda word_list: remove_stopwords(word_list))
# to lowercase
train_df["cleaned_corpus"] = train_df["cleaned_corpus"].apply(lambda word_list: to_lowercase(word_list))
# extracting lemma
train_df["cleaned_corpus"] = train_df["cleaned_corpus"].apply(lambda word_list: extract_word_lemma(word_list))

In [10]:
# keeping alphabets only
test_df["cleaned_corpus"] = test_df["text"].apply(lambda text: clean_text(text) if not pd.isna(text) else "")
# removing stopwords
test_df["cleaned_corpus"] = test_df["cleaned_corpus"].apply(lambda word_list: remove_stopwords(word_list))
# to lowercase
test_df["cleaned_corpus"] = test_df["cleaned_corpus"].apply(lambda word_list: to_lowercase(word_list))
# extracting lemma
test_df["cleaned_corpus"] = test_df["cleaned_corpus"].apply(lambda word_list: extract_word_lemma(word_list))

Task 3

In [11]:
# corpus list initalization
corpus = []

In [12]:
# creating corpus from tagged documents
for k, df in enumerate([train_df, test_df]):
    print(f"[Info:] Working with dataframe {k+1}")
    for i, row in df.iterrows():
        corpus.append(
            TaggedDocument(
                words=row["cleaned_corpus"],
                tags=[row["label"]]
            )
        )

[Info:] Working with dataframe 1
[Info:] Working with dataframe 2


In [13]:
# sanity check
print("[Info:] Corpus size: ", len(corpus))
print("[Info:] Total size: ", len(train_df)+len(test_df))

[Info:] Corpus size:  36913
[Info:] Total size:  36913


In [14]:
# creating doc2vec object
doc2vec_window_4 = Doc2Vec(
    vector_size=300,
    window=4,
    workers=10
)

In [15]:
# initiating vocabs
doc2vec_window_4.build_vocab(corpus)

In [16]:
# training doc2vec
doc2vec_window_4.train(
    corpus,
    total_examples=doc2vec_window_4.corpus_count,
    epochs=10
)

Task 4

In [17]:
# inferring vectors by passing tokens of each row
vectors = []
for k, df in enumerate([train_df, test_df]):
    print(f"[Info:] Working with dataframe {k+1}")
    for i, row in df.iterrows():
        vectors.append(
            doc2vec_window_4.infer_vector(row["cleaned_corpus"])
        )

[Info:] Working with dataframe 1
[Info:] Working with dataframe 2


In [18]:
# sanity check vector sizes
vectors[0].shape

(300,)

In [19]:
# concat dataframe, train_df and then test_df
df = pd.concat([train_df, test_df])
df.head()

Unnamed: 0,text,label,cleaned_corpus
0,Trump administration to review goal of world w...,1,"[trump, administration, review, goal, world, w..."
1,Turkish academics to be tried in April over Ku...,1,"[turkish, academic, try, april, kurdish, lette..."
2,Factbox: Italy's new electoral law offers a mi...,1,"[factbox, italy, new, electoral, law, offer, m..."
3,WATCH: Trump Get His A** Handed To Him By Chr...,0,"[watch, trump, get, his, a, hand, to, him, by,..."
4,Mexico president says Trump visit could have b...,1,"[mexico, president, say, trump, visit, could, ..."


In [20]:
# including vectors as separate column for each row of 
# the concatenated dataframe
# df["vectors"] = vector_list
df["vectors"] = vectors
df.head()

Unnamed: 0,text,label,cleaned_corpus,vectors
0,Trump administration to review goal of world w...,1,"[trump, administration, review, goal, world, w...","[-0.60876584, 0.8974504, -0.22005816, 0.385045..."
1,Turkish academics to be tried in April over Ku...,1,"[turkish, academic, try, april, kurdish, lette...","[0.16721743, 0.9464363, -0.31198788, 0.6175166..."
2,Factbox: Italy's new electoral law offers a mi...,1,"[factbox, italy, new, electoral, law, offer, m...","[-0.311118, 0.9653612, -0.25668618, 0.3904911,..."
3,WATCH: Trump Get His A** Handed To Him By Chr...,0,"[watch, trump, get, his, a, hand, to, him, by,...","[-0.7879092, 0.15529503, 0.19538662, 1.0960517..."
4,Mexico president says Trump visit could have b...,1,"[mexico, president, say, trump, visit, could, ...","[-0.021175845, 0.5596713, -0.24676293, 0.57634..."


In [21]:
# construction of regression df just to isolate
regression_df = pd.DataFrame({"vectors": df["vectors"], "label": df["label"]})
regression_df.head()

Unnamed: 0,vectors,label
0,"[-0.60876584, 0.8974504, -0.22005816, 0.385045...",1
1,"[0.16721743, 0.9464363, -0.31198788, 0.6175166...",1
2,"[-0.311118, 0.9653612, -0.25668618, 0.3904911,...",1
3,"[-0.7879092, 0.15529503, 0.19538662, 1.0960517...",0
4,"[-0.021175845, 0.5596713, -0.24676293, 0.57634...",1


In [22]:
# training a logistic regression with default hyper params
# inital test set size 10%
X_train, X_test, y_train, y_test = train_test_split(
    regression_df["vectors"].tolist(),
    regression_df["label"],
    test_size=0.1
)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression()

In [23]:
# performing predictions
y_pred = logreg.predict(X_test)

In [24]:
# constructing confusion matrix
confusion_matrix(y_pred=y_pred, y_true=y_test)

array([[1871,   72],
       [  52, 1697]])

In [25]:
# calculating accuracy score
accuracy_score(y_pred=y_pred, y_true=y_test)

0.9664138678223185

In [26]:
# calculating f1 score
f1_score(y_pred=y_pred, y_true=y_test)

0.9647527003979534

Task 5

In [27]:
# creating corpus from tagged documents only in train data
train_corpus = []
for i, row in train_df.iterrows():
    train_corpus.append(
        TaggedDocument(
            words=row["cleaned_corpus"],
            tags=[row["label"]]
        )
    )

In [28]:
# creating doc2vec object on train corpus
doc2vec_window_4_train = Doc2Vec(
    vector_size=300,
    window=4,
    workers=10
)
# initiating vocabs
doc2vec_window_4_train.build_vocab(train_corpus)
# training doc2vec on train set
doc2vec_window_4_train.train(
    train_corpus,
    total_examples=doc2vec_window_4_train.corpus_count,
    epochs=10
)

In [29]:
# inferring vectors by passing tokens of each row
vectors_train = []
for i, row in train_df.iterrows():
    vectors_train.append(
        doc2vec_window_4_train.infer_vector(row["cleaned_corpus"])
    )
# creating vectors column for vectors
train_df["vectors"] = vectors_train
# regression df creation on train set
regression_df_train = pd.DataFrame(
    {"vectors": train_df["vectors"], "label": train_df["label"]}
)
regression_df_train.head()

Unnamed: 0,vectors,label
0,"[-0.005902393, 0.1592356, 0.07952905, 0.689933...",1
1,"[0.3238817, 0.07887071, -0.088781185, -0.16598...",1
2,"[-0.021988591, -0.0017752807, -0.070960835, -0...",1
3,"[0.07347778, -0.1422588, -0.05785784, 0.360505...",0
4,"[-0.14076675, 0.07505335, -0.074533395, 0.2981...",1


In [30]:
# on train set only, 
# training a logistic regression with default hyper params
# inital test set size 10%
X_train, X_test, y_train, y_test = train_test_split(
    regression_df_train["vectors"].tolist(),
    regression_df_train["label"],
    test_size=0.1
)
# intialising and training by fitting data
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression()

In [31]:
# performing predictions
y_pred = logreg.predict(X_test)

In [32]:
# constructing confusion matrix
confusion_matrix(y_pred=y_pred, y_true=y_test)

array([[582,  10],
       [  8, 523]])

In [33]:
# calculating accuracy score
accuracy_score(y_pred=y_pred, y_true=y_test)

0.9839715048975958

In [34]:
# calculating f1 score
f1_score(y_pred=y_pred, y_true=y_test)

0.9830827067669173

Accuracy score is almost same but around 2% in hold-out training