In [1]:
import pandas as pd
import numpy as np
import textstat as txst
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics


train_df = pd.read_csv('data/train.csv', encoding = 'latin-1')
train_df['rd_flesch_ease'] = train_df.excerpt.apply(txst.flesch_reading_ease)
train_df['rd_dalechall'] = train_df.excerpt.apply(txst.dale_chall_readability_score)
train_df['rd_colemanliau'] = train_df.excerpt.apply(txst.coleman_liau_index)

# These are the values we want to predict
targets = np.array(train_df['target'])

# Remove the targets from the set
# Convert pandas df to np arrays, saving the column headers
train_df = train_df.drop(['id', 'target', 'url_legal', 'license', 'excerpt'], axis = 1)
feature_list = list(train_df.columns)
features = np.array(train_df)

In [2]:
# Splitting data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, targets, test_size = 0.25, random_state = 20)
#### NOTE! I'm setting random_state to 20 so we get the same results every time we run the split.
#### Results are reproducible, but maybe this is not the best approach

In [3]:
# Sanity check: we expect the training features number of columns to match the testing feature number of columns 
# and the number of rows to match for the respective training and testing features and the labels
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (2125, 4)
Training Labels Shape: (2125,)
Testing Features Shape: (709, 4)
Testing Labels Shape: (709,)


In [4]:
# We establish baseline: We want our model to beat the mean of the standard error.

baseline_preds = test_features[:, feature_list.index('standard_error')]
baseline_errors = abs(baseline_preds - test_labels)
print('Average baseline error: ', round(np.mean(baseline_errors), 2))

Average baseline error:  1.49


In [5]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 20)
#### NOTE! random_state again

# Train the model on training data
rf.fit(train_features, train_labels);

In [6]:
# Predictions on the test data
pred = rf.predict(test_features)

# Compute performance
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(test_labels, pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(test_labels, pred))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(test_labels, pred)))

Mean Absolute Error (MAE): 0.5577004881819099
Mean Squared Error (MSE): 0.5632883943672775
Root Mean Squared Error (RMSE): 0.7505254122061941
