In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR, NuSVR

from xgboost import XGBRegressor


In [2]:
# The feature tables loaded below come from the create_variables.py script.
# Uncomment the matching %run line to regenerate a file before reading it.

# Train features:
#%run ../scripts/create_variables.py -f ../data/train.csv -c excerpt -nc cleaned_text -nf processed_analysed_train.csv
# Test features:
#%run ../scripts/create_variables.py -f ../data/test.csv -c excerpt -nc cleaned_text -nf processed_analysed_test.csv

# Read both processed tables (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/outputs/processed_analysed_train.csv',
        '../data/outputs/processed_analysed_test.csv',
    )
)

In [3]:
# Split the training table into the regression target and the feature matrix.
# `drop_feat` lists every column that must not be used as a model input
# (the target itself plus text / identifier / metadata columns).
drop_feat = ['excerpt', 'cleaned_text', 'id', 'standard_error', 'target', 'url_legal', 'license']

y = train_df['target']
X = train_df.drop(columns=drop_feat)

# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,friend,alway,light,you,name,end,carri,set,though,need,...,sentence_count,sentence_score,rd_automatedindex,rd_fogscale,rd_colemanliau,rd_flesch_ease,rd_linearwrite,rd_fleschkincaid_grade,rd_dalechall,rd_consensus
0,0.0,0.0,0.076923,0.0,0.0,0.75,0.0,0.0,0.0,0.0,...,11,1.3431,8.3,8.31,8.06,80.31,9.0,6.1,6.65,9.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12,1.5504,7.2,7.53,6.78,82.54,7.285714,5.2,5.92,8.0
2,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,...,8,1.071,10.1,10.49,7.2,75.74,14.75,7.9,6.29,8.0
3,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,...,5,0.6693,16.4,13.61,8.54,72.02,12.5,11.4,6.61,7.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5,0.8666,11.8,11.76,4.83,75.47,13.5,10.0,1.57,12.0


### Support Vector Machines: SVR and NuSVR

In [5]:
# SVR models
#
# Only the 'poly' and 'rbf' kernels are fitted here; 'linear' and 'sigmoid'
# are left out due to their bad results.
svr_kernels = ['poly', 'rbf']

# gamma is fixed to a single value; the sweep below is kept for reference only.
#gamma = np.arange(0.1, 1, 0.4)
gam = 0.1

# Fit one SVR per kernel and keep its predictions.
# NOTE: each model is fitted on (X, y) and predicts on the same X, so these are
# in-sample predictions.
svr_pred = []
for ker in svr_kernels:
    svr_pred.append(
        SVR(C=100, kernel=ker, degree=3, gamma=gam, coef0=1, epsilon=.1)
        .fit(X, y)
        .predict(X)
    )

# Despite the `_acc` suffix these are mean squared errors (lower is better),
# measured on the training data.
svr_acc = [mean_squared_error(y, fitted) for fitted in svr_pred]

# Report "<kernel>: <mse>" for every kernel.
for ker, acc in zip(svr_kernels, svr_acc):
    print(ker, acc, sep=": ")

In [None]:
# NuSVR models
#
# All four kernels are fitted with the same hyper-parameters.
nusvr_kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# Fit one NuSVR per kernel and keep its predictions.
# NOTE: each model is fitted on (X, y) and predicts on the same X, so these are
# in-sample predictions.
nusvr_pred = []
for ker in nusvr_kernels:
    nusvr_pred.append(
        NuSVR(C=100, kernel=ker, degree=3, gamma=0.1, coef0=1, nu=.1)
        .fit(X, y)
        .predict(X)
    )

# Despite the `_acc` suffix these are mean squared errors (lower is better),
# measured on the training data.
nusvr_acc = [mean_squared_error(y, fitted) for fitted in nusvr_pred]

# Report "<kernel>: <mse>" for every kernel.
for ker, acc in zip(nusvr_kernels, nusvr_acc):
    print(ker, acc, sep=": ")

In [None]:
# Hold out 30% of the labelled data for validation (fixed seed for repeatability).
# The full rows are split together with the target so both stay aligned.
data_train, data_val, target_train, target_val = train_test_split(
    train_df,
    train_df["target"],
    test_size=0.3,
    random_state=5,
)

# Remove the columns listed in `drop_feat` from both parts, exactly as was done
# for the full feature matrix.
X_train, X_val = (
    frame.drop(columns=drop_feat) for frame in (data_train, data_val)
)

In [None]:
# XGBoost baseline: fit on the training split, cross-validate on it, then
# measure the error on the held-out validation split.
xgbreg = XGBRegressor()
xgbreg.fit(X_train, target_train)

# 5-fold shuffled cross-validation on the training split. No `scoring` is
# passed, so cross_val_score uses the estimator's own default scorer.
kfold = KFold(n_splits=5, random_state=7, shuffle=True)
results = cross_val_score(xgbreg, X_train, target_train, cv=kfold)
print("CV score per fold:", results)

# Predict on the validation split. The original cell used `y_test_pred`
# without ever defining it, which raised a NameError on a fresh kernel.
y_test_pred = xgbreg.predict(X_val)

# mean_squared_error expects (y_true, y_pred).
mse = mean_squared_error(target_val, y_test_pred)
print(mse)