In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/processed/model_data/dataset.csv', encoding='utf8')
df

Unnamed: 0,likes_count,replies_count,retweets_count,user_followers,user_following,user_tweet_count,time_diff,char_count,special_char_count,user_freq,label
0,0.0,0.0,0.0,11048.0,12139.0,132952.0,85241.847222,248,9,3,fake
1,0.0,0.0,0.0,1.0,28.0,406.0,28880.900000,256,4,1,fake
2,20.0,9.0,0.0,1151.0,1263.0,12830.0,56963.965000,254,13,3,fake
3,1.0,3.0,0.0,,,,,299,15,1,fake
4,1.0,2.0,2.0,779.0,2408.0,17014.0,12471.530000,226,10,1,fake
...,...,...,...,...,...,...,...,...,...,...,...
75300,4.0,0.0,1.0,1671.0,2879.0,11472.0,2661.463333,188,12,1,real
75301,1.0,1.0,0.0,2313.0,861.0,138518.0,91662.934722,135,10,1,real
75302,0.0,1.0,0.0,1.0,26.0,241.0,5481.786389,205,7,1,real
75303,0.0,0.0,0.0,17.0,80.0,495.0,-1.683333,184,17,1,real


### Preprocessing

In [3]:
# Number of rows with missing values
df.isna().any().sum()

4

In [4]:
# Let's just drop missing value rows
df = df.dropna()
df

Unnamed: 0,likes_count,replies_count,retweets_count,user_followers,user_following,user_tweet_count,time_diff,char_count,special_char_count,user_freq,label
0,0.0,0.0,0.0,11048.0,12139.0,132952.0,85241.847222,248,9,3,fake
1,0.0,0.0,0.0,1.0,28.0,406.0,28880.900000,256,4,1,fake
2,20.0,9.0,0.0,1151.0,1263.0,12830.0,56963.965000,254,13,3,fake
4,1.0,2.0,2.0,779.0,2408.0,17014.0,12471.530000,226,10,1,fake
5,0.0,1.0,0.0,206.0,1921.0,5563.0,46492.581667,239,8,1,fake
...,...,...,...,...,...,...,...,...,...,...,...
75300,4.0,0.0,1.0,1671.0,2879.0,11472.0,2661.463333,188,12,1,real
75301,1.0,1.0,0.0,2313.0,861.0,138518.0,91662.934722,135,10,1,real
75302,0.0,1.0,0.0,1.0,26.0,241.0,5481.786389,205,7,1,real
75303,0.0,0.0,0.0,17.0,80.0,495.0,-1.683333,184,17,1,real


In [5]:
# Encode the label column
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()

df.loc[:,'label'] = lb.fit_transform(df['label'].to_numpy())
df['label']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


0        0
1        0
2        0
4        0
5        0
        ..
75300    1
75301    1
75302    1
75303    1
75304    1
Name: label, Length: 70956, dtype: int64

In [6]:
# Class balance looks ok
df['label'].value_counts()

0    37153
1    33803
Name: label, dtype: int64

In [7]:
# Shuffle data and convert to Numpy
from sklearn.model_selection import train_test_split

df = df.sample(frac = 1)
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1].to_numpy(), df['label'].to_numpy(), train_size=0.8)

### Modeling

In [8]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier as RF

#### Feature Selection

In [9]:
# Look at mutual information on train set
mutual_info = mutual_info_classif(X_train, y_train, discrete_features=False, random_state=0)
mutual_info = pd.Series(mutual_info, index=df.columns[df.columns != 'label']).sort_values(ascending=False)
mutual_info

user_tweet_count      0.048545
user_followers        0.031752
time_diff             0.026463
user_following        0.019628
user_freq             0.018395
char_count            0.016691
special_char_count    0.007339
retweets_count        0.003372
replies_count         0.002594
likes_count           0.000669
dtype: float64

#### Baseline LR

In [10]:
# Score with all features
clf = LR()
scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')
baseline_score = scores.mean()
baseline_score

0.11190037176691599

In [11]:
# Try with RF
rf_clf = RF()
scores = cross_val_score(rf_clf, X_train, y_train, cv=3, scoring='f1')
rf_score = scores.mean()
rf_score

0.6194553187875905

In [12]:
# Try with top 5 MI features
new_feature_set = ['user_tweet_count','user_followers','time_diff', 'char_count', 'user_following']
feature_idxs = [df.columns.get_loc(f) for f in new_feature_set]
X_train_MI = X_train[:,feature_idxs]

In [16]:
# Try again with baseline model
scores = cross_val_score(clf, X_train_MI, y_train, cv=3, scoring='f1')
score = scores.mean()
score

0.1122929683511198

In [17]:
# Try again with RF model
scores = cross_val_score(rf_clf, X_train_MI, y_train, cv=3, scoring='f1')
score = scores.mean()
score

0.5660976640824394

In [14]:
# Worse - let's look at PCA
pca = PCA().fit(X_train)
pcomps = pca.transform(X_train)
pc_col_names = ["PC"+str(i) for i in range(X_train.shape[1])]
pcomps_train = pd.DataFrame(pcomps, columns=pc_col_names)

pca.explained_variance_ratio_

array([9.86905302e-01, 1.15086668e-02, 1.49822928e-03, 8.73805674e-05,
       4.11875071e-07, 5.16964685e-09, 3.82021121e-09, 2.09251700e-10,
       2.46952168e-11, 2.24028611e-11])

In [15]:
# Looks like there is very high unexplained variance (~98%). These features are mostly noise.
# We are not going to get good results