# LinearSVC

In [46]:
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler       # scaling data
from sklearn.neighbors import KNeighborsRegressor    # regressor
from sklearn.model_selection import GridSearchCV     # for grid search
from sklearn.pipeline import make_pipeline           # for making pipelines
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import warnings
# Install a blanket "ignore" filter: every Python warning raised after this
# point (including deprecation and convergence warnings from the libraries
# imported above) is silenced for the rest of the session.
warnings.filterwarnings('ignore') 

## Import Data and Packages

In [25]:
import math
from datetime import datetime
import numpy as np
import pandas as pd
import seaborn as sns # for visualization
import matplotlib.pyplot as plt # plotting
import matplotlib
# Apply the 'ggplot' style sheet to matplotlib figures created from here on.
matplotlib.style.use('ggplot')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import altair as alt
# Select the Altair renderer registered under the name 'notebook'.
# NOTE(review): presumably targets the classic Jupyter Notebook front end and
# may be unavailable in newer Altair releases -- confirm for your environment.
alt.renderers.enable('notebook')

## Load the raw posts and keep the label column aside before any cleaning
df = pd.read_csv('./facebook_with_reactions.csv')
outcomes = df[['Rating']]

## Drop the columns that are not used as features, then replace spaces in the
## remaining column names with underscores
df = (
    df
    .drop(columns=[
        'Debate',
        'status_link',
        'permalink_url',
        'Post URL',
        'status_message',
        'link_name',
        'share_count',
        'Unnamed: 0',
        'account_id',
        'status_id',
        'status_type',
        'status_published',
    ])
    .rename(columns=lambda name: name.replace(' ', '_'))
)

## Work on a copy so the cleaned `df` is left untouched by the modeling steps
model_data = df.copy()

## Encode the Rating labels as integers (0-3) for classification
model_data['Rating'] = model_data['Rating'].replace({
    'mostly false': 0,
    'no factual content': 1,
    'mixture of true and false': 2,
    'mostly true': 3,
}).astype(int)

## Remove post_id and reaction_count, then expand the remaining categorical
## columns into dummy (indicator) columns so every feature is numeric
model_data = pd.get_dummies(model_data.drop(columns=['post_id', 'reaction_count']))

## Separate the target (Rating) from the feature columns
df_target = model_data['Rating']
df_dropped = model_data.drop(columns=['Rating'])

## Handle Nulls

In [26]:
# Forward-fill missing values: each NaN takes the last valid value above it in
# the same column (NaNs before the first valid value of a column stay NaN).
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
model_data = model_data.ffill()

# The feature/target frames were split off from `model_data` before this cell,
# so filling `model_data` alone never reaches the data used for training.
# Re-derive both from the filled frame so the null handling takes effect.
df_target = model_data['Rating']
df_dropped = model_data.drop(columns=['Rating'])

## Feature Selection

In [30]:
from boruta import BorutaPy

In [50]:
df_corr = df_dropped.corr()

# For every feature, list its strongest absolute correlation with any *other*
# feature as (value, feature_name). Position [1] of the descending sort is
# taken because position [0] is expected to be the feature's correlation with
# itself. Each row is sorted once in the inner generator and the result is
# reused for both the `> 0.` filter and the output tuple.
# NOTE(review): NaN correlations (e.g. from constant columns) would make the
# sort order unreliable -- confirm none are present.
[
    (runner_up, feature)
    for runner_up, feature in (
        (sorted(abs(row), reverse=True)[1], name)
        for row, name in zip(df_corr.values, df_corr.index)
    )
    if runner_up > 0.
]

[(0.9851900394574168, 'comment_count'),
 (0.9934614632397851, 'num_reactions'),
 (0.9851900394574168, 'num_comments'),
 (0.8796823438566503, 'num_shares'),
 (0.9934614632397851, 'num_likes'),
 (0.8499930450155352, 'num_loves'),
 (0.6848062079499695, 'num_wows'),
 (0.6070097024781741, 'num_hahas'),
 (0.7277387574661918, 'num_sads'),
 (0.7277387574661918, 'num_angrys'),
 (0.6183634672395973, 'Category_left'),
 (0.6208150489115978, 'Category_mainstream'),
 (0.6653217609163748, 'Category_right'),
 (0.3011850208124464, 'Page_ABC News Politics'),
 (0.5013979413359474, 'Page_Addicting Info'),
 (0.45560229277869924, 'Page_CNN Politics'),
 (0.6653217609163748, 'Page_Eagle Rising'),
 (0.6183634672395973, 'Page_Occupy Democrats'),
 (0.5415003234186943, 'Page_Politico'),
 (0.640983616820896, 'Page_Right Wing News'),
 (0.4639639937523467, 'Page_The Other 98%'),
 (0.18256921272417576, 'Date_Published_2016-09-19'),
 (0.18766955067898947, 'Date_Published_2016-09-20'),
 (0.18220239292646567, 'Date_Publ

In [38]:
# Boruta-based feature selection is currently disabled; the two commented
# lines below are kept for reference only.
# NOTE(review): BorutaPy presumably needs an estimator that exposes
# `feature_importances_`, which LinearSVC may not provide -- confirm before
# re-enabling.
#svc = LinearSVC()
#feat_selector = BorutaPy(svc, n_estimators='auto', verbose=2)
# Placeholder: a list holding a single empty string. No later cell visible in
# this notebook reads it.
final_features = ['']

## Pipeline

In [43]:
# Split data into training (70%) and testing (30%) data.
# A fixed random_state pins the shuffle, so the split -- and every score
# computed from it -- is reproducible after Restart & Run All.
# NOTE(review): if the Rating classes are imbalanced, consider adding
# `stratify=df_target` to keep class proportions equal in both splits.
train_features, test_features, train_outcome, test_outcome = train_test_split(
    df_dropped,
    df_target,
    test_size=0.30,
    random_state=42,
)

In [49]:
# Scale every feature with MinMaxScaler, then fit a linear SVM classifier.
# LinearSVC is seeded so repeated runs give the same fitted model.
pipe = make_pipeline(MinMaxScaler(), LinearSVC(random_state=42))

# NOTE(review): the grid is empty, so GridSearchCV evaluates exactly one
# candidate (the pipeline defaults) and only provides cross-validation.
# To actually tune, add entries such as {'linearsvc__C': [0.01, 0.1, 1, 10]}.
param_grid = {}

# 10-fold CV with shuffling; random_state fixes the fold assignment so the
# cross-validation results are reproducible.
grid = GridSearchCV(
    pipe,
    param_grid,
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
)
grid.fit(train_features, train_outcome)

# Score of the refit best pipeline on the held-out test split (displayed as
# the cell output).
grid.score(test_features, test_outcome)

#df_selected_1= feature_selection(df_dropped,df_target,model_data)

0.7676923076923077