In [16]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import normalize
from sklearn.tree import DecisionTreeClassifier

# Reading Data

In [2]:
def segmentWords(s):
    """Split ``s`` on runs of whitespace and return the pieces as a list."""
    tokens = s.split()
    return tokens

def readFile(fileName):
    """Read a text file and return its contents as a list of words.

    Parameters
    ----------
    fileName : str
        Path of the file to read. It is opened in text mode with the
        platform default encoding.

    Returns
    -------
    list of str
        The whitespace-separated tokens of the file, in order of appearance,
        as produced by ``segmentWords``.
    """
    # The context manager closes the handle even when reading raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open(fileName) as f:
        contents = f.read()
    return segmentWords(contents)

#### Create a Dataframe containing the counts of each word in a file

In [3]:
# Build one record (dict) per training document: the count of every word in the
# document, plus two bookkeeping keys -- '__FileID__' (the file name) and
# '__CLASS__' (the name of the sub-directory the file was found in).
d = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order --
# and therefore the seeded train/validation sample drawn later -- the same on
# every machine.
for c in sorted(os.listdir("data_training")):
    # Skip the macOS Finder metadata file that can sit next to the class folders.
    if c == ".DS_Store":
        continue
    directory = os.path.join("data_training", c)
    for file in sorted(os.listdir(directory)):
        words = readFile(os.path.join(directory, file))
        # Counter tallies every word in a single pass; calling words.count(x)
        # for each word rescanned the whole document every time (quadratic).
        e = dict(Counter(words))
        e['__FileID__'] = file
        e['__CLASS__'] = c
        d.append(e)

Create a dataframe from `d`, making sure to fill all the NaN values with zeros.

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html


In [4]:
# Turn the list of per-document word-count records into a single frame and
# replace every missing entry with 0.
data = pd.DataFrame.from_records(d).fillna(value=0)

#### Split data into training and validation set 

* Sample 80% of your dataframe to be the training data

* Let the remaining 20% be the validation data (you can keep the rows of the original dataframe whose indices weren't selected for the training data)

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [5]:
# Draw a seeded 80% sample of the rows for training; every row whose index
# label was not drawn becomes validation data.
training_data = data.sample(frac=0.8, random_state=200)
in_training = data.index.isin(training_data.index)
validation_data = data[~in_training]
training_data.head()

Unnamed: 0,,earth,goodies,if,ripley,suspend,they,white,,,...,zukovsky,zundel,zurg's,zweibel,zwick,zwick's,zwigoff's,zycie,zycie',|
366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


* Split the dataframe for both training and validation data into x and y dataframes - where y contains the labels and x contains the words

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [6]:
# Separate features from labels: X keeps every column except the two
# bookkeeping columns, y is the '__CLASS__' column.
non_feature_columns = ['__CLASS__', '__FileID__']

X_train, y_train = training_data.drop(non_feature_columns, axis=1), training_data['__CLASS__']
X_valid, y_valid = validation_data.drop(non_feature_columns, axis=1), validation_data['__CLASS__']

# Logistic Regression

#### Basic Logistic Regression
* Use sklearn's linear_model.LogisticRegression() to create your model.
* Fit the data and labels with your model.
* Score your model with the same data and labels.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [7]:
# Baseline logistic regression with default settings: fitted on the training
# split and scored on the held-out validation split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logmodel = logreg  # fit() returns the estimator itself, so both names refer to one model
logmodel.score(X_valid, y_valid)

0.81874999999999998

#### Changing Parameters

In [8]:
# Rescale the fitted coefficients with sklearn's normalize(), then measure how
# the rescaled values spread around their mean.
coef_normed = normalize(logmodel.coef_)
std = np.std(coef_normed)
mean = np.mean(coef_normed)

# A coefficient is "included" when it lies inside [mean - std, mean + std].
# Both bounds have to hold at once: joining them with `|` accepted every
# coefficient, because any number is either <= mean + std or >= mean - std.
within_one_std = (coef_normed >= mean - std) & (coef_normed <= mean + std)
changed_logmodel = coef_normed[within_one_std]

# Positions (along the first row of coefficients) that fall inside / outside
# the band, computed from the mask instead of a manual index loop.
coef_normed_access = coef_normed[0]
list_indices_included = np.flatnonzero(within_one_std[0]).tolist()
list_indices_not_included = np.flatnonzero(~within_one_std[0]).tolist()

# Sanity check: the two lists together account for every coefficient in the row.
print(len(list_indices_included) + len(list_indices_not_included))

45671


#### Feature Selection
* In the backward stepwise selection method, you can remove coefficients and the corresponding x columns, where the coefficient is more than a particular amount away from the mean - you can choose how far from the mean is reasonable.

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html# 

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.where.html

https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.std.html

https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.mean.html

How did you select which features to remove? Why did that reduce overfitting?

Answer:
Remove features whose normalized coefficient is more than 1 standard deviation away from the mean of the normalized coefficients. This reduces overfitting by removing outliers and features that are not as close to the decision boundary.

# Single Decision Tree

#### Basic Decision Tree

* Initialize your model as a decision tree with sklearn.
* Fit the data and labels to the model.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html


In [9]:
# Baseline decision tree with default settings: fitted on the training split
# and scored on the held-out validation split.
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
dtcmodel = dtc  # fit() returns the estimator itself, so both names refer to one model
dtcmodel.score(X_valid, y_valid)

0.625

#### Changing Parameters
* To test out which value is optimal for a particular parameter, you can either loop through various values or look into sklearn.model_selection.GridSearchCV

References:


http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [10]:
def mean(lst):
    """Return the arithmetic mean of the values in ``lst``.

    An empty ``lst`` raises ZeroDivisionError.
    """
    total = sum(lst)
    return total / len(lst)

# One-at-a-time parameter sweep for a single decision tree: for each parameter
# in `param_heads`, fit one tree per candidate value (all other parameters left
# at their defaults) on the training split and record its validation accuracy.
# `results` maps each parameter name to the list of scores, in the same order
# as the candidate values in `params`.
# NOTE(review): no random_state is passed to the trees, so scores may differ
# from run to run.
params = {"splitter": ["best", "random"],
          "min_samples_split": [3, 4, 5, 6],
          "min_samples_leaf": [2, 3, 4],
          "max_features": ["auto", "sqrt", "log2", None],
          "class_weight": ["balanced", None],
          "presort": [True, False]}

param_heads = ["splitter", "min_samples_split", "min_samples_leaf", "max_features", "class_weight", "presort"]

results = {}
for header in param_heads:
    temp_results = []
    for param_value in params[header]:
        # Keyword expansion sets exactly the parameter under test. This replaces
        # a branch-per-parameter chain that left the model as None (and crashed
        # on .fit) for any header without a matching branch.
        dtc_temp = DecisionTreeClassifier(**{header: param_value})
        dtcmodel_temp = dtc_temp.fit(X_train, y_train)
        temp_results.append(dtcmodel_temp.score(X_valid, y_valid))
    results[header] = temp_results

print(results)

{'presort': [0.65312499999999996, 0.61875000000000002], 'splitter': [0.63437500000000002, 0.64375000000000004], 'min_samples_leaf': [0.66874999999999996, 0.59999999999999998, 0.61250000000000004], 'max_features': [0.58750000000000002, 0.58750000000000002, 0.57187500000000002, 0.63749999999999996], 'min_samples_split': [0.62187499999999996, 0.609375, 0.61875000000000002, 0.64687499999999998], 'class_weight': [0.63749999999999996, 0.66249999999999998]}


How did you choose which parameters to change and what value to give to them? Feel free to show a plot.

Why is a single decision tree so prone to overfitting?

# Random Forest Classifier

#### Basic Random Forest

* Use sklearn's ensemble.RandomForestClassifier() to create your model.
* Fit the data and labels with your model.
* Score your model with the same data and labels.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html


In [11]:
# Baseline random forest with a fixed seed. The score below is computed on the
# same training data the forest was fitted on, so it measures fit to the
# training set rather than performance on held-out data.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)
rfcmodel = rfc  # fit() returns the estimator itself, so both names refer to one model
rfcmodel.score(X_train, y_train)

0.99531250000000004

#### Changing Parameters

In [26]:
def mean(lst):
    """Average the values in ``lst`` (sum divided by the number of items).

    An empty ``lst`` raises ZeroDivisionError.
    """
    count = len(lst)
    total = sum(lst)
    return total / count

# One-at-a-time parameter sweep for the random forest: for each parameter in
# `param_heads`, fit one forest per candidate value (all other parameters left
# at their defaults) on the training split and record its validation accuracy.
# `results` maps each parameter name to the list of scores, in the same order
# as the candidate values in `params`.
# NOTE(review): no random_state is passed to the forests here, so scores may
# differ from run to run.
params = {"min_samples_split": [3, 4, 5, 6],
          "min_samples_leaf": [2, 3, 4],
          "max_features": ["auto", "sqrt", "log2", None],
          "class_weight": ["balanced", None]}

param_heads = ["min_samples_split", "min_samples_leaf", "max_features", "class_weight"]

results = {}
for header in param_heads:
    temp_results = []
    for param_value in params[header]:
        # Keyword expansion sets exactly the parameter under test. This replaces
        # a branch-per-parameter chain that left the model as None (and crashed
        # on .fit) for any header without a matching branch.
        rfc_temp = RandomForestClassifier(**{header: param_value})
        rfcmodel_temp = rfc_temp.fit(X_train, y_train)
        temp_results.append(rfcmodel_temp.score(X_valid, y_valid))
    results[header] = temp_results

print(results)

{'min_samples_split': [0.65312499999999996, 0.69687500000000002, 0.640625, 0.67812499999999998], 'max_features': [0.63124999999999998, 0.65625, 0.60312500000000002, 0.671875], 'class_weight': [0.67500000000000004, 0.67812499999999998], 'min_samples_leaf': [0.69687500000000002, 0.65312499999999996, 0.64375000000000004]}


What parameters did you choose to change and why?

How does a random forest classifier prevent overfitting better than a single decision tree?