In [1]:
import math
from datetime import datetime
import numpy as np
import pandas as pd
import seaborn as sns # for visualiation
import matplotlib.pyplot as plt # plotting
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

# Load the raw posts-with-reactions table from the working directory.
df = pd.read_csv('./facebook_with_reactions.csv')

# One-column frame holding the Rating labels exactly as they appear in the CSV
# (captured before the recoding below).
outcomes = df[['Rating']]

## drop useless columns 
# NOTE(review): post_id is not dropped here, so it stays in the frame and is
# later handed to the models as a numeric feature -- confirm that is intended.
df = df.drop(columns=['Debate', 'status_link', 'permalink_url', 'Post URL', 
                      'status_message', 'link_name', 'share_count', 'Unnamed: 0', 'account_id',
                      'status_id', 'status_type', 'status_published', 'Date Published'])

## remove spaces in columns
df.columns = [c.replace(' ', '_') for c in df.columns]

## recode Rating as a binary target: 1 only for 'mostly true', 0 otherwise
# An explicit map + integer cast is used instead of Series.replace: replace
# relies on pandas silently downcasting the object column to int, which newer
# pandas releases deprecate. If Rating stayed object-typed, get_dummies below
# would one-hot encode it and the `Rating` column would disappear.
RATING_TO_LABEL = {
    'mostly false': 0,
    'no factual content': 0,
    'mixture of true and false': 0,
    'mostly true': 1,
}
rating_codes = df.Rating.map(RATING_TO_LABEL)
unmapped = rating_codes.isna()
if unmapped.any():
    # Fail here with a clear message rather than deep inside a later model fit.
    raise ValueError(
        'Unexpected Rating values: {}'.format(
            sorted(df.Rating[unmapped].astype(str).unique())
        )
    )
df.Rating = rating_codes.astype(int)

# One-hot encode the remaining non-numeric columns.
df = pd.get_dummies(df)

In [2]:
# Preview only the first rows instead of rendering the entire frame.
df.head(10)

Unnamed: 0,post_id,Rating,reaction_count,comment_count,num_reactions,num_comments,num_shares,num_likes,num_loves,num_wows,...,Page_CNN Politics,Page_Eagle Rising,Page_Occupy Democrats,Page_Politico,Page_Right Wing News,Page_The Other 98%,Post_Type_link,Post_Type_photo,Post_Type_text,Post_Type_video
0,1035057923259100,0,146.0,15.0,145,14,0,115,7,0,...,0,0,0,0,0,0,0,0,0,1
1,1035269309904628,1,33.0,34.0,29,32,0,26,0,2,...,0,0,0,0,0,0,1,0,0,0
2,1035305953234297,1,63.0,27.0,61,24,34,49,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1035322636565962,1,170.0,86.0,163,80,33,88,13,3,...,0,0,0,0,0,0,1,0,0,0
4,1035352946562931,1,3188.0,2815.0,3156,2779,556,1855,722,21,...,0,0,0,0,0,0,0,0,0,1
5,1035366579894901,1,28.0,21.0,28,19,23,15,1,0,...,0,0,0,0,0,0,1,0,0,0
6,1035411183223774,1,409.0,105.0,399,103,44,324,44,1,...,0,0,0,0,0,0,0,0,0,1
7,1035430716555154,1,62.0,64.0,59,59,7,31,1,1,...,0,0,0,0,0,0,1,0,0,0
8,1035451683219724,1,39.0,6.0,39,5,7,38,1,0,...,0,0,0,0,0,0,1,0,0,0
9,1035453279886231,1,278.0,59.0,273,54,147,229,26,1,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# Hold out 20% of the rows for final evaluation.
# The seed is fixed so that "Restart Kernel -> Run All" reproduces the same
# split (and therefore the same selected features and test score).
SPLIT_SEED = 42
train_features, test_features, train_outcome, test_outcome = train_test_split(
    df.drop(columns='Rating'),
    df.Rating,
    test_size=0.20,
    random_state=SPLIT_SEED
)

In [4]:
# Recursive feature elimination with cross-validation, using a
# logistic-regression model as the estimator that ranks the features.
logreg = LogisticRegression(max_iter=1500, solver='lbfgs')
rfe = RFECV(estimator=logreg)
rfe.fit(X=train_features, y=train_outcome)

RFECV(cv=None,
   estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1500, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False),
   n_jobs=1, scoring=None, step=1, verbose=0)

In [5]:
# Restrict both splits to the columns that RFECV kept.
# rfe.support_ is a boolean mask with one entry per column of the frame that
# rfe was fitted on.
cols = list(train_features.columns.values)

# Guard against re-running this cell after train_features has already been
# narrowed: zip() would silently truncate and pick the wrong columns.
if len(cols) != len(rfe.support_):
    raise ValueError(
        'train_features has {} columns but rfe.support_ has {} entries; '
        're-create the train/test split and re-fit rfe before selecting features'
        .format(len(cols), len(rfe.support_))
    )

# `keep` (not `bool`) so the builtin is not shadowed for the rest of the notebook.
features = [feature for keep, feature in zip(rfe.support_, cols) if keep]
train_features = train_features[features]
test_features = test_features[features]

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline 

# k-nearest-neighbours classifier fed with min-max scaled features.
clf = KNeighborsClassifier()
knn_pipe = make_pipeline(MinMaxScaler(), clf)

# Candidate hyper-parameters for the grid search; each key is prefixed with
# the pipeline step the parameter belongs to.
knn_param_grid = {
    'kneighborsclassifier__n_neighbors': range(1, 10),
    'kneighborsclassifier__weights': [
        'uniform',
        'distance',
    ],
    'kneighborsclassifier__algorithm': [
        'ball_tree',
        'kd_tree',
        'brute',
    ],
}

In [7]:
# Search the KNN grid on the training split (fit() returns the fitted search
# object), then report the score of the refit best model on the held-out split.
knn_grid = GridSearchCV(
    estimator=knn_pipe,
    param_grid=knn_param_grid,
).fit(train_features, train_outcome)
knn_grid.score(test_features, test_outcome)

0.7534562211981567

In [8]:
# Show the hyper-parameter combination selected by the grid search.
knn_grid.best_params_

{'kneighborsclassifier__algorithm': 'brute',
 'kneighborsclassifier__n_neighbors': 5,
 'kneighborsclassifier__weights': 'distance'}