# Part 5: Creating the Final Model

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

In [2]:
data = pd.read_csv('../data/data_with_letters.csv')

In [3]:
data.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'user_id', 'sentence',
       'type_time', 'function_keys', 'topic', 'length', 'thethe', 'tha',
       'health', 'consenting', 'sandy', 'wouldn', 'hte', 'fof', 'considering',
       'walked', 'sentiment', 'about_gay', 'about_gun', 'contains_E',
       'contains_I', 'contains_T', 'contains_R', 'contains_O', 'contains_A',
       'contains_S', 'contains_N', 'contains_L'],
      dtype='object')

In [4]:
# Dropping columns that will not be used in either X or y
data.drop(columns = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 
                     'user_id', 'sentence', 'topic', 'about_gay', 'about_gun', 'length'],
         inplace = True)

In [5]:
X = data.drop(columns = ['type_time'])
y = data['type_time']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [7]:
ss = StandardScaler()
Xs_train = ss.fit_transform(X_train)
Xs_test = ss.transform(X_test)

In [8]:
lr = LinearRegression()
lr.fit(Xs_train, y_train)
lr.score(Xs_train, y_train)

0.8507275661015299

In [9]:
lr.score(Xs_test, y_test)

0.8545258872860904

In [10]:
dtr = DecisionTreeRegressor()
dtr.fit(Xs_train, y_train)
dtr.score(Xs_train, y_train)

0.999907148234461

In [11]:
dtr.score(Xs_test, y_test)

0.7140487128133897

In [12]:
br = BaggingRegressor()
br.fit(Xs_train, y_train)
br.score(Xs_train, y_train)

0.970523502231382

In [13]:
br.score(Xs_test, y_test)

0.849127458681961

In [14]:
rfr = RandomForestRegressor()
rfr.fit(Xs_train, y_train)
rfr.score(Xs_train, y_train)

0.9795546940577183

In [15]:
rfr.score(Xs_test, y_test)

0.8621494146217465

In [16]:
coef = pd.DataFrame(rfr.feature_importances_, columns = ['coef'], index = X.columns)
coef.sort_values('coef', ascending = False)

Unnamed: 0,coef
contains_E,0.569566
function_keys,0.123737
contains_O,0.063329
contains_T,0.054368
contains_A,0.052789
contains_I,0.042359
contains_N,0.022275
sentiment,0.020253
contains_S,0.018336
contains_R,0.015814


Since the 10 words with the highest impact on typing time have low coefficients, we can try to drop them and see if the model improves.

In [17]:
X.drop(columns = ['thethe', 'tha', 'health', 'consenting', 'sandy',
                  'wouldn', 'hte', 'fof', 'considering', 'walked'],
       inplace = True)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [19]:
ss = StandardScaler()
Xs_train = ss.fit_transform(X_train)
Xs_test = ss.transform(X_test)

In [20]:
rfr = RandomForestRegressor()
rfr.fit(Xs_train, y_train)
rfr.score(Xs_train, y_train)

0.9798118894374178

In [21]:
rfr.score(Xs_test, y_test)

0.8624630650025346

The accuracy only slightly increases for the training set and decreases for the test set, so it appears the 10 words help prevent overfitting.

## Conclusion

Out of every model we have seen in this project, the best-performing model is the random forest regressor with the predictors being the 10 most impactful words, the 9 most impactful letters, the number of function keys, and the sentiment.