# MSDS 7331 Data Mining: 1st random forest model

***  

Team: Andrew Abbott, Vivek Bejugama, Patrick McDevitt, Preeti Swaminathan



### Data Loading

#### Import Required Packages

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter('ignore',DeprecationWarning)
import seaborn as sns
import time

from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn import metrics as mt


#### Import Data from .csv file

In [28]:
data_dir = '../data/'
data_file = 'mashable_clean_dataset_for_LR_and_SVM.csv'

file_2_read = data_dir + data_file
df = pd.read_csv(file_2_read)  

#### Strip leading spaces from column names

An initial summary of the dataset attribute follows along with the simple statistics of the numeric attributes.

In [29]:
#df.info()

In [30]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
n_tokens_title,39644.0,10.398749,2.114037,2.0,9.0,10.0,12.0,23.0
num_keywords,39644.0,7.223767,1.90913,1.0,6.0,7.0,9.0,10.0
data_channel_is_lifestyle,39644.0,0.052946,0.223929,0.0,0.0,0.0,0.0,1.0
data_channel_is_entertainment,39644.0,0.178009,0.382525,0.0,0.0,0.0,0.0,1.0
data_channel_is_socmed,39644.0,0.058597,0.234871,0.0,0.0,0.0,0.0,1.0
kw_avg_min,39644.0,312.366967,620.783887,-1.0,141.75,235.5,357.0,42827.857143
kw_max_max,39644.0,752324.066694,214502.129573,0.0,843300.0,843300.0,843300.0,843300.0
kw_avg_max,39644.0,259281.938083,135102.247285,0.0,172846.875,244572.222223,330980.0,843300.0
weekday_is_monday,39644.0,0.16802,0.373889,0.0,0.0,0.0,0.0,1.0
weekday_is_tuesday,39644.0,0.186409,0.389441,0.0,0.0,0.0,0.0,1.0


# Model Creation

## Training and Testing Split


For training and testing purposes, we use 80% of the observations for training and 20% for testing. This process is repeated three times and uses the shuffle split cross validation method built into scikit-learn.

In [31]:
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# ...  copy data frame to classification working data frame
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=


df_rand_forest = df.copy()

# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# ...  dataframe in which to record results of model metrics
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

results_table_labels = ['n_features', 'n_estimate', 'process_time', 'accuracy', 'recall', 'precision', 'f1_score']
df_results = pd.DataFrame(columns = results_table_labels)

# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# ...  separate X and y matrices 
# ...
# ...  convert to numpy matrices by calling 'values' on the pandas data frames
# ...  they are now simple matrices for compatibility with scikit-learn
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

if 'popular' in df_rand_forest:
    y = df_rand_forest['popular'].values         # set 'popular' as dependent
    del df_rand_forest['popular']                # remove from dataset
    X = df_rand_forest.values                    # use everything else for independent EVs

# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# ...  setup cross-validation in sklearn
# ...
# ...  split into training and test sets
# ....  --> 3 folds
# ...   --> 80% / 20% training / test
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

num_cv_iterations = 3

num_instances = len(y)

cv_object = ShuffleSplit(n_splits = num_cv_iterations,
                         test_size  = 0.2)
                         
print(cv_object)

ShuffleSplit(n_splits=3, random_state=None, test_size=0.2, train_size=None)



***  

## c. __Random Forest__   

*** 



First, to prepare the data for the Random Forest classifier, the data is split into training and test using the indices in the Cross Validation object.

In [32]:
for train_indices, test_indices in cv_object.split(X, y): 
    X_train = X[train_indices]
    y_train = y[train_indices]    
    X_test = X[test_indices]
    y_test = y[test_indices]

In [33]:

print('\n----------------------------------------------------')
print('\tRandom Forest - --- --- --- --- --- --- --- ---')
print('----------------------------------------------------')
 
n_estimate = 10
for idepth in range(0, 20):
    
    n_estimate = n_estimate + 5
    tic = time.clock()

# ...  get classifier object

    classifier_rand_forest = RandomForestClassifier(n_estimators = n_estimate)

    classifier_rand_forest = classifier_rand_forest.fit(X_train, y_train)
    
    # ... print out process time

    toc = time.clock()
    
# ... test set predictions

    y_hat = classifier_rand_forest.predict(X_test)

# ... model classification metrics

    acc = mt.accuracy_score(y_test, y_hat)
    rec = mt.recall_score(y_test, y_hat)
    pre = mt.precision_score(y_test, y_hat)
    f1s = mt.f1_score(y_test, y_hat)
    conf = mt.confusion_matrix(y_test, y_hat)

#    print('\n----------------------------------------------------')
#    print('Accuracy  = %9.3f' % acc )
#    print('Recall    = %9.3f' % rec )
#    print('Precision = %9.3f' % pre )
#    print('F1 Score  = %9.3f' % f1s )
#    print('\n----------------------------------------------------')
#    print('Confusion matrix\n', conf)
#    print('----------------------------------------------------')

# ... add model metrics to results data frame

    new_row = [len(df_rand_forest.columns), n_estimate, toc-tic, acc, rec, pre, f1s]

    df_results.loc[len(df_results)] = new_row

    print(df_results)


----------------------------------------------------
	Random Forest - --- --- --- --- --- --- --- ---
----------------------------------------------------
   n_features  n_estimate  process_time  accuracy    recall  precision  \
0        39.0        15.0      1.726299  0.628453  0.617595   0.627486   

   f1_score  
0  0.622501  
   n_features  n_estimate  process_time  accuracy    recall  precision  \
0        39.0        15.0      1.726299  0.628453  0.617595   0.627486   
1        39.0        20.0      2.201617  0.626939  0.564709   0.640611   

   f1_score  
0  0.622501  
1  0.600270  
   n_features  n_estimate  process_time  accuracy    recall  precision  \
0        39.0        15.0      1.726299  0.628453  0.617595   0.627486   
1        39.0        20.0      2.201617  0.626939  0.564709   0.640611   
2        39.0        25.0      2.788183  0.639803  0.632596   0.638112   

   f1_score  
0  0.622501  
1  0.600270  
2  0.635342  
   n_features  n_estimate  process_time  accuracy

    n_features  n_estimate  process_time  accuracy    recall  precision  \
0         39.0        15.0      1.726299  0.628453  0.617595   0.627486   
1         39.0        20.0      2.201617  0.626939  0.564709   0.640611   
2         39.0        25.0      2.788183  0.639803  0.632596   0.638112   
3         39.0        30.0      3.320447  0.646614  0.594203   0.659610   
4         39.0        35.0      3.869851  0.650523  0.639715   0.650129   
5         39.0        40.0      4.407446  0.644722  0.610984   0.651220   
6         39.0        45.0      5.354454  0.652920  0.643529   0.652152   
7         39.0        50.0      6.013840  0.650271  0.612001   0.658730   
8         39.0        55.0      6.723190  0.652037  0.635901   0.653344   
9         39.0        60.0      7.284696  0.649262  0.619883   0.654672   
10        39.0        65.0      7.636483  0.649010  0.636664   0.649041   
11        39.0        70.0      9.298020  0.654559  0.629291   0.658946   
12        39.0        75.

    n_features  n_estimate  process_time  accuracy    recall  precision  \
0         39.0        15.0      1.726299  0.628453  0.617595   0.627486   
1         39.0        20.0      2.201617  0.626939  0.564709   0.640611   
2         39.0        25.0      2.788183  0.639803  0.632596   0.638112   
3         39.0        30.0      3.320447  0.646614  0.594203   0.659610   
4         39.0        35.0      3.869851  0.650523  0.639715   0.650129   
5         39.0        40.0      4.407446  0.644722  0.610984   0.651220   
6         39.0        45.0      5.354454  0.652920  0.643529   0.652152   
7         39.0        50.0      6.013840  0.650271  0.612001   0.658730   
8         39.0        55.0      6.723190  0.652037  0.635901   0.653344   
9         39.0        60.0      7.284696  0.649262  0.619883   0.654672   
10        39.0        65.0      7.636483  0.649010  0.636664   0.649041   
11        39.0        70.0      9.298020  0.654559  0.629291   0.658946   
12        39.0        75.