# 1. Data transformation

## Original Textual data

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Loading the original dataset
text = pd.read_csv(r"Datasets\Celebrity Dataset\Original dataset\fakenews_data.csv")

In [4]:
text

Unnamed: 0,Polarity,Text
0,Fake,Universities Agree with Trump on Immigration\n...
1,Fake,Chris Pratt responds to body shamers telling h...
2,Fake,"""Dancing With the Stars"": First couple won the..."
3,Fake,Brian Cox hosts scientist Postman Pat\n\nProfe...
4,Fake,London Stock Exchange has dropped Merger with ...
...,...,...
475,TRUE,Jason Witten signs new four-year deal with Dal...
476,TRUE,Google Maps can tell your friends exactly wher...
477,TRUE,Let there be light: German scientists test 'ar...
478,TRUE,Roger Federer beats Frances Tiafoe on return ...


In [5]:
text = text.replace("TRUE", "True")

## Sentiment scores

Aside from the vectorization performed with he TCT, we'll look at polarity and subjectivity scores just in case there are major differences between the two classes, as it is usual in fake news detection.

For the subjectivity score I will use Textblob and for the polarity scores I will use VADER as it gives a more granular insight into the sentiment in the text.

In [6]:
from textblob import TextBlob
import nltk
# nltk.download('vader_lexicon')
# nltk.download('punkt')
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [7]:
sent = SentimentIntensityAnalyzer()

text["Overall_Sentiment"] = text["Text"].apply(lambda row:sent.polarity_scores(row)["compound"])
text["Positive_Sentiment"] = text["Text"].apply(lambda row:sent.polarity_scores(row)["neg"])
text["Negative_Sentiment"] = text["Text"].apply(lambda row:sent.polarity_scores(row)["pos"])
text["Neutral_Sentiment"] = text["Text"].apply(lambda row:sent.polarity_scores(row)["neu"])

text["Subjectivity"] = text["Text"].apply(lambda row:TextBlob(row).sentiment[1])

In [8]:
text

Unnamed: 0,Polarity,Text,Overall_Sentiment,Positive_Sentiment,Negative_Sentiment,Neutral_Sentiment,Subjectivity
0,Fake,Universities Agree with Trump on Immigration\n...,0.7783,0.033,0.109,0.858,0.332143
1,Fake,Chris Pratt responds to body shamers telling h...,0.7763,0.045,0.106,0.849,0.513112
2,Fake,"""Dancing With the Stars"": First couple won the...",-0.2944,0.150,0.123,0.726,0.397129
3,Fake,Brian Cox hosts scientist Postman Pat\n\nProfe...,0.6486,0.000,0.032,0.968,0.497222
4,Fake,London Stock Exchange has dropped Merger with ...,0.9531,0.000,0.145,0.855,0.454167
...,...,...,...,...,...,...,...
475,True,Jason Witten signs new four-year deal with Dal...,0.6705,0.017,0.069,0.914,0.428052
476,True,Google Maps can tell your friends exactly wher...,0.9590,0.024,0.180,0.796,0.395543
477,True,Let there be light: German scientists test 'ar...,0.7964,0.000,0.090,0.910,0.620455
478,True,Roger Federer beats Frances Tiafoe on return ...,0.9532,0.089,0.175,0.736,0.473939


## Loading the data after the vectorization with the TCT

In [9]:
fake_df = pd.read_table(r'Datasets\Celebrity Dataset\Data Transformed\fake.tsv', delimiter = '\t')
true_df = pd.read_table(r'Datasets\Celebrity Dataset\Data Transformed\true.tsv', delimiter = '\t')

In [10]:
# print("FAKE \n ", fake_df.describe())
# print("TRUE \n ", true_df.describe())

In [11]:
if "text_key" in fake_df.columns:
    fake_df= fake_df.drop(labels="text_key", axis = 1) # categorical value will trigger an error when normalizing so we take this column out
    
if "text_key" in true_df.columns:   
    true_df = true_df.drop(labels="text_key", axis = 1)
    
# if "id" in fake_df.columns: 
#     fake_df= fake_df.drop(labels="id", axis = 1) 

# if "id" in true_df.columns: 
#     true_df = true_df.drop(labels="id", axis = 1)

In [12]:
#checking the columns are the same to verify we have the same features for both dfs

print(fake_df.columns == true_df.columns)


[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True]


In [13]:
#adding the polarity column

true_df["Polarity"] = "True"
fake_df["Polarity"] = "Fake"

#adding the DFs

dfs = [fake_df, true_df]
true_and_fake = pd.concat(dfs, axis = 0)

In [14]:
true_and_fake.shape

(480, 64)

In [15]:
# how to index the columns excluding the polarity 
true_and_fake.iloc[:, :-1]

Unnamed: 0,id,DESPC,DESSC,DESWC,DESPL,DESPLd,DESPLw,DESSL,DESSLd,DESWLsy,...,WORD_PROPERTY_WRDPOLc,WORD_PROPERTY_WRDHYPn,WORD_PROPERTY_WRDHYPv,WORD_PROPERTY_WRDHYPnv,WORD_PROPERTY_AOA,WORD_PROPERTY_AOA_MAX,WORD_PROPERTY_CONCRETENESS,WORD_PROPERTY_PREVALENCE,WORD_PROPERTY_PREVALENCE_MIN,WORD_SET_INCIDENCE_C4_COMMON_WORDS
0,1,5.0,2.0,98.0,1.000000,0.000000,48.500000,49.000000,1.414214,1.908163,...,8.266667,5.857143,1.250000,4.181818,6.196173,12.630000,2.523704,2.316489,2.316489,0.204082
1,2,3.0,5.0,126.0,2.500000,2.121320,60.500000,25.200000,13.535139,1.285714,...,7.216667,5.916667,1.421053,3.161290,4.892755,11.810000,2.688333,2.317815,2.317815,0.111111
2,3,3.0,6.0,132.0,3.000000,2.828427,65.000000,22.000000,11.933147,1.348485,...,7.113924,6.433333,0.941176,4.446809,5.274480,11.699415,2.743217,2.313201,2.313201,0.174242
3,4,3.0,5.0,140.0,2.500000,2.121320,69.500000,28.000000,10.653638,1.342857,...,6.701299,6.423077,0.923077,4.589744,5.012647,10.280000,2.840101,2.274963,2.274963,0.135714
4,5,3.0,7.0,137.0,3.500000,3.535534,65.000000,19.571429,6.827814,1.306569,...,8.724638,6.818182,1.550000,4.309524,5.335149,14.720000,2.391863,2.289871,2.289871,0.175182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,236,3.0,5.0,137.0,2.500000,2.121320,66.500000,27.400000,9.449868,1.394161,...,9.041096,6.230769,2.071429,4.775000,5.534232,11.786635,2.695100,2.292326,2.292326,0.167883
236,237,9.0,4.0,131.0,0.800000,0.447214,25.400000,32.750000,20.172176,1.244275,...,9.916667,6.043478,0.823529,3.825000,5.002707,11.940000,2.602743,2.319229,2.319229,0.129771
237,238,5.0,2.0,94.0,0.666667,0.577350,30.666667,47.000000,1.414214,1.468085,...,8.830508,6.545455,1.583333,4.794118,5.695342,14.620000,2.855897,2.310864,2.310864,0.106383
238,239,3.0,10.0,199.0,5.000000,4.242641,92.500000,19.900000,10.650509,1.366834,...,9.404040,6.178571,0.588235,4.066667,4.998212,12.890000,2.628047,2.312582,2.312582,0.090452


In [16]:
# Normalizing the data 

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

cols = list(true_and_fake.columns[:-1])

true_and_fake[cols] = scaler.fit_transform(true_and_fake[cols])

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


In [17]:
df_scaled = true_and_fake

In [18]:
df_scaled

Unnamed: 0,id,DESPC,DESSC,DESWC,DESPL,DESPLd,DESPLw,DESSL,DESSLd,DESWLsy,...,WORD_PROPERTY_WRDHYPn,WORD_PROPERTY_WRDHYPv,WORD_PROPERTY_WRDHYPnv,WORD_PROPERTY_AOA,WORD_PROPERTY_AOA_MAX,WORD_PROPERTY_CONCRETENESS,WORD_PROPERTY_PREVALENCE,WORD_PROPERTY_PREVALENCE_MIN,WORD_SET_INCIDENCE_C4_COMMON_WORDS,Polarity
0,0.000000,0.444444,0.000000,0.146154,0.027027,0.000000,0.151143,0.682139,0.000000,1.000000,...,0.404405,0.233333,0.363359,0.536598,0.396186,0.393986,0.674648,0.674648,0.685198,Fake
1,0.004184,0.222222,0.176471,0.253846,0.148649,0.176471,0.201948,0.249849,0.248429,0.109532,...,0.420646,0.278947,0.072002,0.057090,0.309322,0.612986,0.682585,0.682585,0.252618,Fake
2,0.008368,0.222222,0.235294,0.276923,0.189189,0.235294,0.220999,0.191726,0.215594,0.199330,...,0.561619,0.150980,0.439013,0.197521,0.297608,0.685996,0.654967,0.654967,0.546360,Fake
3,0.012552,0.222222,0.176471,0.307692,0.148649,0.176471,0.240051,0.300706,0.189370,0.191279,...,0.558820,0.146154,0.479820,0.101196,0.147246,0.814876,0.426085,0.426085,0.367094,Fake
4,0.016736,0.222222,0.294118,0.296154,0.229730,0.294118,0.220999,0.147614,0.110956,0.139367,...,0.666625,0.313333,0.399818,0.219840,0.617585,0.218604,0.515320,0.515320,0.550734,Fake
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,0.983264,0.222222,0.176471,0.296154,0.148649,0.176471,0.227350,0.289808,0.164698,0.264674,...,0.506349,0.452381,0.532710,0.293080,0.306847,0.621988,0.530017,0.530017,0.516772,True
236,0.987448,0.888889,0.117647,0.273077,0.010811,0.037203,0.053345,0.386983,0.384460,0.050249,...,0.455246,0.119608,0.261489,0.097540,0.323093,0.499129,0.691053,0.691053,0.339440,True
237,0.991632,0.444444,0.000000,0.130769,0.000000,0.048029,0.075642,0.645812,0.000000,0.370429,...,0.592211,0.322222,0.538168,0.352350,0.606992,0.835890,0.640981,0.640981,0.230619,True
238,0.995816,0.222222,0.470588,0.534615,0.351351,0.352941,0.337426,0.153582,0.189306,0.225581,...,0.492107,0.056863,0.330483,0.095886,0.423729,0.532790,0.651265,0.651265,0.156495,True


In [19]:
df_scaled = df_scaled.drop(["id"], axis=1)

## Merging the vectorized TCT df and the Sentiment df

In [20]:
df_scaled = df_scaled.reset_index(drop=True)
df_scaled = pd.concat([df_scaled, text], axis =1)
df_scaled.columns

Index(['DESPC', 'DESSC', 'DESWC', 'DESPL', 'DESPLd', 'DESPLw', 'DESSL',
       'DESSLd', 'DESWLsy', 'DESWLsyd', 'DESWLlt', 'DESWLltd', 'LDTTRc',
       'LDTTRa', 'LDMTLD', 'LDHDD', 'SYNLE', 'SYNNP', 'SYNMEDpos', 'SYNMEDwrd',
       'SYNMEDlem', 'SYNSTRUTa', 'SYNSTRUTt', 'RDFRE', 'READFKGL',
       'TOKEN_ATTRIBUTE_RATIO_ALHPA', 'TOKEN_ATTRIBUTE_RATIO_DIGIT',
       'TOKEN_ATTRIBUTE_RATIO_PUNCT', 'TOKEN_ATTRIBUTE_RATIO_URL',
       'TOKEN_ATTRIBUTE_RATIO_EMAIL', 'WORD_SET_INCIDENCE_WRDPRP1s',
       'WORD_SET_INCIDENCE_WRDPRP1p', 'WORD_SET_INCIDENCE_WRDPRP2',
       'WORD_SET_INCIDENCE_WRDPRP3s', 'WORD_SET_INCIDENCE_WRDPRP3p',
       'WORD_SET_INCIDENCE_CNCCaus', 'WORD_SET_INCIDENCE_CNCLogic',
       'WORD_SET_INCIDENCE_CNCTemp', 'WORD_SET_INCIDENCE_CNCAdd',
       'WORD_SET_INCIDENCE_CNCPos', 'WORD_SET_INCIDENCE_CNCNeg',
       'WORD_PROPERTY_WRDNOUN', 'WORD_PROPERTY_WRDVERB',
       'WORD_PROPERTY_WRDADJ', 'WORD_PROPERTY_WRDADV', 'WORD_PROPERTY_WRDFRQc',
       'WORD_PROPERTY_WRDFRQa'

In [21]:
df_scaled = df_scaled.T.drop_duplicates().T #removing the duplicate Polarity column
del df_scaled["Text"]

df_scaled.columns

Index(['DESPC', 'DESSC', 'DESWC', 'DESPL', 'DESPLd', 'DESPLw', 'DESSL',
       'DESSLd', 'DESWLsy', 'DESWLsyd', 'DESWLlt', 'DESWLltd', 'LDTTRc',
       'LDTTRa', 'LDMTLD', 'LDHDD', 'SYNLE', 'SYNNP', 'SYNMEDpos', 'SYNMEDwrd',
       'SYNMEDlem', 'SYNSTRUTa', 'SYNSTRUTt', 'RDFRE', 'READFKGL',
       'TOKEN_ATTRIBUTE_RATIO_ALHPA', 'TOKEN_ATTRIBUTE_RATIO_DIGIT',
       'TOKEN_ATTRIBUTE_RATIO_PUNCT', 'TOKEN_ATTRIBUTE_RATIO_URL',
       'TOKEN_ATTRIBUTE_RATIO_EMAIL', 'WORD_SET_INCIDENCE_WRDPRP1s',
       'WORD_SET_INCIDENCE_WRDPRP1p', 'WORD_SET_INCIDENCE_WRDPRP2',
       'WORD_SET_INCIDENCE_WRDPRP3s', 'WORD_SET_INCIDENCE_WRDPRP3p',
       'WORD_SET_INCIDENCE_CNCCaus', 'WORD_SET_INCIDENCE_CNCLogic',
       'WORD_SET_INCIDENCE_CNCTemp', 'WORD_SET_INCIDENCE_CNCAdd',
       'WORD_SET_INCIDENCE_CNCPos', 'WORD_SET_INCIDENCE_CNCNeg',
       'WORD_PROPERTY_WRDNOUN', 'WORD_PROPERTY_WRDVERB',
       'WORD_PROPERTY_WRDADJ', 'WORD_PROPERTY_WRDADV', 'WORD_PROPERTY_WRDFRQc',
       'WORD_PROPERTY_WRDFRQa'

## Handling missing values

In [22]:
df_scaled.isna().sum().sort_values(ascending=False)       


SYNSTRUTt                      480
DESPLd                          30
DESPC                            0
WORD_PROPERTY_WRDADJ             0
WORD_PROPERTY_WRDFAMc            0
                              ... 
TOKEN_ATTRIBUTE_RATIO_EMAIL      0
WORD_SET_INCIDENCE_WRDPRP1s      0
WORD_SET_INCIDENCE_WRDPRP1p      0
WORD_SET_INCIDENCE_WRDPRP2       0
Subjectivity                     0
Length: 67, dtype: int64

We can see that the SYNSTRUTt feature is empty for all the rows so we will just delete that column. I also noticed later down the line that the feature TOKEN_ATTRIBUTE_RATIO_EMAIL only has 0 as a value, which was causing problems, so I will be dropping that feature as well.


As far as the DesPld feature, we will remove the rows that have empty values since there are not that many.

In [23]:
for col in df_scaled:
    if len(list(df_scaled[col].unique())) < 2:
        print(col)

SYNSTRUTt
TOKEN_ATTRIBUTE_RATIO_EMAIL


In [24]:
df_scaled = df_scaled.drop(labels=["SYNSTRUTt"], axis=1)
df_scaled = df_scaled.drop(labels=["TOKEN_ATTRIBUTE_RATIO_EMAIL"], axis=1)

In [25]:
df_scaled = df_scaled.dropna()

In [26]:
#Number of features and text

print("Number of features:", df_scaled.drop(labels=["Polarity"], axis=1).shape[1])
print("Number of texts:", df_scaled.drop(labels=["Polarity"], axis=1).shape[0])

Number of features: 64
Number of texts: 450


In [27]:
df_scaled

Unnamed: 0,DESPC,DESSC,DESWC,DESPL,DESPLd,DESPLw,DESSL,DESSLd,DESWLsy,DESWLsyd,...,WORD_PROPERTY_AOA_MAX,WORD_PROPERTY_CONCRETENESS,WORD_PROPERTY_PREVALENCE,WORD_SET_INCIDENCE_C4_COMMON_WORDS,Polarity,Overall_Sentiment,Positive_Sentiment,Negative_Sentiment,Neutral_Sentiment,Subjectivity
0,0.444444,0.0,0.146154,0.027027,0.0,0.151143,0.682139,0.0,1.0,0.563762,...,0.396186,0.393986,0.674648,0.685198,Fake,0.7783,0.033,0.109,0.858,0.332143
1,0.222222,0.176471,0.253846,0.148649,0.176471,0.201948,0.249849,0.248429,0.109532,0.116088,...,0.309322,0.612986,0.682585,0.252618,Fake,0.7763,0.045,0.106,0.849,0.513112
2,0.222222,0.235294,0.276923,0.189189,0.235294,0.220999,0.191726,0.215594,0.19933,0.134906,...,0.297608,0.685996,0.654967,0.54636,Fake,-0.2944,0.15,0.123,0.726,0.397129
3,0.222222,0.176471,0.307692,0.148649,0.176471,0.240051,0.300706,0.18937,0.191279,0.119235,...,0.147246,0.814876,0.426085,0.367094,Fake,0.6486,0.0,0.032,0.968,0.497222
4,0.222222,0.294118,0.296154,0.22973,0.294118,0.220999,0.147614,0.110956,0.139367,0.084885,...,0.617585,0.218604,0.51532,0.550734,Fake,0.9531,0.0,0.145,0.855,0.454167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,0.222222,0.176471,0.296154,0.148649,0.176471,0.22735,0.289808,0.164698,0.264674,0.203276,...,0.306847,0.621988,0.530017,0.516772,True,0.6705,0.017,0.069,0.914,0.428052
476,0.888889,0.117647,0.273077,0.010811,0.037203,0.053345,0.386983,0.38446,0.050249,0.047174,...,0.323093,0.499129,0.691053,0.33944,True,0.959,0.024,0.18,0.796,0.395543
477,0.444444,0.0,0.130769,0.0,0.048029,0.075642,0.645812,0.0,0.370429,0.251662,...,0.606992,0.83589,0.640981,0.230619,True,0.7964,0.0,0.09,0.91,0.620455
478,0.222222,0.470588,0.534615,0.351351,0.352941,0.337426,0.153582,0.189306,0.225581,0.206593,...,0.423729,0.53279,0.651265,0.156495,True,0.9532,0.089,0.175,0.736,0.473939


In [28]:
df_scaled.columns

Index(['DESPC', 'DESSC', 'DESWC', 'DESPL', 'DESPLd', 'DESPLw', 'DESSL',
       'DESSLd', 'DESWLsy', 'DESWLsyd', 'DESWLlt', 'DESWLltd', 'LDTTRc',
       'LDTTRa', 'LDMTLD', 'LDHDD', 'SYNLE', 'SYNNP', 'SYNMEDpos', 'SYNMEDwrd',
       'SYNMEDlem', 'SYNSTRUTa', 'RDFRE', 'READFKGL',
       'TOKEN_ATTRIBUTE_RATIO_ALHPA', 'TOKEN_ATTRIBUTE_RATIO_DIGIT',
       'TOKEN_ATTRIBUTE_RATIO_PUNCT', 'TOKEN_ATTRIBUTE_RATIO_URL',
       'WORD_SET_INCIDENCE_WRDPRP1s', 'WORD_SET_INCIDENCE_WRDPRP1p',
       'WORD_SET_INCIDENCE_WRDPRP2', 'WORD_SET_INCIDENCE_WRDPRP3s',
       'WORD_SET_INCIDENCE_WRDPRP3p', 'WORD_SET_INCIDENCE_CNCCaus',
       'WORD_SET_INCIDENCE_CNCLogic', 'WORD_SET_INCIDENCE_CNCTemp',
       'WORD_SET_INCIDENCE_CNCAdd', 'WORD_SET_INCIDENCE_CNCPos',
       'WORD_SET_INCIDENCE_CNCNeg', 'WORD_PROPERTY_WRDNOUN',
       'WORD_PROPERTY_WRDVERB', 'WORD_PROPERTY_WRDADJ', 'WORD_PROPERTY_WRDADV',
       'WORD_PROPERTY_WRDFRQc', 'WORD_PROPERTY_WRDFRQa',
       'WORD_PROPERTY_WRDFRQmc', 'WORD_PROPERTY_W

Saving the transformed dataset we will use for the Models

In [29]:
df_scaled.to_csv(r"Datasets\Celebrity Dataset\Celebrity_dataset_transformed.csv", index=False)