In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/processed/model_data/dataset.csv', encoding='utf8')
df

Unnamed: 0,likes_count,replies_count,retweets_count,user_followers,user_following,user_tweet_count,time_diff,char_count,special_char_count,user_freq,label
0,0.0,0.0,0.0,11048.0,12139.0,132952.0,85241.847222,248,9,3,fake
1,0.0,0.0,0.0,1.0,28.0,406.0,28880.900000,256,4,1,fake
2,20.0,9.0,0.0,1151.0,1263.0,12830.0,56963.965000,254,13,3,fake
3,1.0,3.0,0.0,,,,,299,15,1,fake
4,1.0,2.0,2.0,779.0,2408.0,17014.0,12471.530000,226,10,1,fake
...,...,...,...,...,...,...,...,...,...,...,...
75300,4.0,0.0,1.0,1671.0,2879.0,11472.0,2661.463333,188,12,1,real
75301,1.0,1.0,0.0,2313.0,861.0,138518.0,91662.934722,135,10,1,real
75302,0.0,1.0,0.0,1.0,26.0,241.0,5481.786389,205,7,1,real
75303,0.0,0.0,0.0,17.0,80.0,495.0,-1.683333,184,17,1,real


### Preprocessing

In [3]:
# Number of rows with missing values
df.isna().any().sum()

4

In [4]:
# Let's just drop missing value rows
df = df.dropna()
df

Unnamed: 0,likes_count,replies_count,retweets_count,user_followers,user_following,user_tweet_count,time_diff,char_count,special_char_count,user_freq,label
0,0.0,0.0,0.0,11048.0,12139.0,132952.0,85241.847222,248,9,3,fake
1,0.0,0.0,0.0,1.0,28.0,406.0,28880.900000,256,4,1,fake
2,20.0,9.0,0.0,1151.0,1263.0,12830.0,56963.965000,254,13,3,fake
4,1.0,2.0,2.0,779.0,2408.0,17014.0,12471.530000,226,10,1,fake
5,0.0,1.0,0.0,206.0,1921.0,5563.0,46492.581667,239,8,1,fake
...,...,...,...,...,...,...,...,...,...,...,...
75300,4.0,0.0,1.0,1671.0,2879.0,11472.0,2661.463333,188,12,1,real
75301,1.0,1.0,0.0,2313.0,861.0,138518.0,91662.934722,135,10,1,real
75302,0.0,1.0,0.0,1.0,26.0,241.0,5481.786389,205,7,1,real
75303,0.0,0.0,0.0,17.0,80.0,495.0,-1.683333,184,17,1,real


In [5]:
# Encode the label column
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()

df.loc[:,'label'] = lb.fit_transform(df['label'].to_numpy())
df['label']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


0        0
1        0
2        0
4        0
5        0
        ..
75300    1
75301    1
75302    1
75303    1
75304    1
Name: label, Length: 70956, dtype: int64

In [6]:
# Class balance looks ok
df['label'].value_counts()

0    37153
1    33803
Name: label, dtype: int64

In [7]:
df.describe()

Unnamed: 0,likes_count,replies_count,retweets_count,user_followers,user_following,user_tweet_count,time_diff,char_count,special_char_count,user_freq,label
count,70956.0,70956.0,70956.0,70956.0,70956.0,70956.0,70956.0,70956.0,70956.0,70956.0,70956.0
mean,18.469474,1.647204,3.670824,44647.2,1835.716078,36732.41,56843.306427,197.906054,10.465542,1.970573,0.476394
std,562.837335,23.5359,88.762061,898625.1,8939.723347,100319.6,36975.144846,67.714749,6.14941,4.450539,0.499446
min,0.0,0.0,0.0,0.0,0.0,1.0,-4.870833,21.0,0.0,1.0,0.0
25%,0.0,0.0,0.0,101.0,194.0,2929.75,20973.623889,155.0,7.0,1.0,0.0
50%,0.0,0.0,0.0,484.0,592.0,10361.0,60277.3225,214.0,10.0,1.0,0.0
75%,2.0,1.0,0.0,2158.0,1623.0,32255.0,90549.110694,239.0,13.0,2.0,1.0
max,98626.0,3475.0,11502.0,78292680.0,596082.0,4050363.0,133756.364444,918.0,107.0,79.0,1.0


In [8]:
# Shuffle data and convert to Numpy
from sklearn.model_selection import train_test_split

df = df.sample(frac = 1)
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1].to_numpy(), df['label'].to_numpy(), train_size=0.8)

### Modeling

In [9]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier as RF

#### Feature Selection

In [10]:
# Look at mutual information on train set
mutual_info = mutual_info_classif(X_train, y_train, discrete_features=False, random_state=0)
mutual_info = pd.Series(mutual_info, index=df.columns[df.columns != 'label']).sort_values(ascending=False)
mutual_info

user_tweet_count      0.049027
user_followers        0.028908
time_diff             0.025700
user_freq             0.020624
user_following        0.018527
char_count            0.012144
special_char_count    0.006168
retweets_count        0.005524
likes_count           0.002740
replies_count         0.000000
dtype: float64

#### Baseline LR

In [11]:
# Score with all features
clf = LR()
scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')
baseline_score = scores.mean()
baseline_score

0.11852275643963384

In [12]:
# Try with RF
rf_clf = RF()
scores = cross_val_score(rf_clf, X_train, y_train, cv=3, scoring='f1')
rf_score = scores.mean()
rf_score

0.6193533299640109

In [13]:
# Try with top 5 MI features
new_feature_set = ['user_tweet_count','user_followers','time_diff', 'char_count', 'user_following']
feature_idxs = [df.columns.get_loc(f) for f in new_feature_set]
X_train_MI = X_train[:,feature_idxs]

In [14]:
# Try again with baseline model
scores = cross_val_score(clf, X_train_MI, y_train, cv=3, scoring='f1')
score = scores.mean()
score

0.1287129887956008

In [15]:
# Try again with RF model
scores = cross_val_score(rf_clf, X_train_MI, y_train, cv=3, scoring='f1')
score = scores.mean()
score

0.5621555041718702

In [16]:
# Worse - let's look at PCA
pca = PCA().fit(X_train)
pcomps = pca.transform(X_train)
pc_col_names = ["PC"+str(i) for i in range(X_train.shape[1])]
pcomps_train = pd.DataFrame(pcomps, columns=pc_col_names)

pca.explained_variance_ratio_

array([9.85288681e-01, 1.28905368e-02, 1.72152190e-03, 9.88504918e-05,
       4.00696743e-07, 5.96305472e-09, 2.53021771e-09, 3.99221474e-10,
       2.83829945e-11, 2.56194418e-11])

In [17]:
# Try with just top 3 Pcomps
X_train_reduced = pcomps_train.iloc[:,:3]
X_train_reduced

Unnamed: 0,PC0,PC1,PC2
0,-43275.287300,-32519.023895,44085.854036
1,-43382.188997,-35899.682590,31829.515533
2,-43223.134217,-33700.203391,41955.957068
3,-4445.298508,-25518.490039,-44212.891948
4,-38523.474485,139868.424991,-37081.539323
...,...,...,...
56759,-43248.848126,-33232.526218,-15826.939521
56760,-43283.548942,-33970.404637,2810.002636
56761,-43386.904508,-39296.842636,51009.484530
56762,-43174.535616,-28606.726896,28074.600972


In [18]:
# Try again with top 3 PCs
scores = cross_val_score(clf, pcomps_train, y_train, cv=3, scoring='f1')
baseline_score = scores.mean()
baseline_score

0.526082726956395

In [19]:
scores = cross_val_score(rf_clf, pcomps_train, y_train, cv=3, scoring='f1')
rf_score = scores.mean()
rf_score

0.5947212623228805