In [24]:
import numpy as np
import pandas as pd
import os
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Reading Data

In [25]:
def segmentWords(s): 
    return s.split()

def readFile(fileName):
    # Function for reading file
    # input: filename as string
    # output: contents of file as list containing single words
    contents = []
    f = open(fileName)
    for line in f:
        contents.append(line)
    f.close()
    result = segmentWords('\n'.join(contents))
    return result

#### Create a Dataframe containing the counts of each word in a file

In [26]:
d = []

for c in os.listdir("data_training"):
    directory = "data_training/" + c
    if c.startswith('.'):
        continue
    for f in os.listdir(directory):
        if f.startswith('.'):
            continue
        words = readFile(directory + "/" + f)
        e = {x: words.count(x) for x in words}
        e['_file'] = f
        e['_class'] = c
        d.append(e)

Create a dataframe from d - make sure to fill all the nan values with zeros.

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html


In [27]:
df = pd.DataFrame(d).fillna(value = 0)
df.head()

Unnamed: 0,,earth,goodies,if,ripley,suspend,they,white,,,...,zukovsky,zundel,zurg's,zweibel,zwick,zwick's,zwigoff's,zycie,zycie',|
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Split data into training and validation set 

* Sample 80% of your dataframe to be the training data

* Let the remaining 20% be the validation data (you can filter out the indicies of the original dataframe that weren't selected for the training data)

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [28]:
train_df, test_df = sklearn.model_selection.train_test_split(df, test_size=.8)
train_df=df.sample(frac=0.8, random_state=200)
test_df=df.drop(train_df.index)

* Split the dataframe for both training and validation data into x and y dataframes - where y contains the labels and x contains the words

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [29]:
columns = list(set(df.columns) - {'_class', '_file'})
X_train, y_train = train_df[columns], train_df._class
X_test, y_test = test_df[columns], test_df._class

# Logistic Regression

#### Basic Logistic Regression
* Use sklearn's linear_model.LogisticRegression() to create your model.
* Fit the data and labels with your model.
* Score your model with the same data and labels.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [30]:
# nonregularized
log_model = LogisticRegression(penalty='l2', C=10000)
log_model.fit(X_train, y_train)

LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [31]:
log_model.score(X_test, y_test)

0.83437499999999998

In [32]:
log_model.coef_

array([[-0.01546583,  0.00401547, -0.03068038, ..., -0.02267432,
        -0.00450413,  0.0028457 ]])

#### Changing Parameters

In [33]:
# regularized
log_model = LogisticRegression(penalty='l2', C=1)
log_model.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [34]:
log_model.score(X_test, y_test)

0.81874999999999998

In [35]:
log_model.coef_

array([[-0.00636629,  0.00194769, -0.01221163, ..., -0.00638716,
        -0.00149716,  0.00346259]])

#### Feature Selection
* In the backward stepsize selection method, you can remove coefficients and the corresponding x columns, where the coefficient is more than a particular amount away from the mean - you can choose how far from the mean is reasonable.

References:

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html#
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.where.html
https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.std.html
https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.mean.html

In [36]:
features = pd.DataFrame(log_model.coef_)
total_mean = np.mean(features, axis = 1)
total_std = np.std(features, axis = 1)
normal_features = features.subtract(total_mean, axis = 0) \
                          .divide(total_std, axis = 0)
feat_selec = np.where(np.absolute(normal_features) > 0.3)[1]
print(total_mean)
print(total_std)
feat_selec.shape

0   -0.000229
dtype: float64
0    0.028671
dtype: float64


(18153,)

In [37]:
X_train_selec = X_train.drop(X_train.columns[feat_selec], axis = 1)
log_model = LogisticRegression(penalty = 'l2', C = 1)
log_model.fit(X_train_selec, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [38]:
X_test_selec = X_test.drop(X_test.columns[feat_selec], axis = 1)
log_model.score(X_test_selec, y_test)

0.66249999999999998

How did you select which features to remove? Why did that reduce overfitting?

To select which features to eliminate, we eliminated all features of which the absolute value of the normalized feature is greater than 0.3 standard deviations away from the mean. We used this to eliminate any features which either would overfit or underfit the data to the training data, in order to make it fit the test data more effectively. Eventually, it turned out that there was a loss of 1% in accuracy, but there was a reduction of about 13000 features, or slightly over 25% of the features, making this a simpler model. 

# Single Decision Tree

#### Basic Decision Tree

* Initialize your model as a decision tree with sklearn.
* Fit the data and labels to the model.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html


In [39]:
d_tree = DecisionTreeClassifier()
d_tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

#### Changing Parameters
* To test out which value is optimal for a particular parameter, you can either loop through various values or look into sklearn.model_selection.GridSearchCV

References:


http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [40]:
best_score_parameter_i = 0
best_score_i = 0
for i in range(-10, -4):
    d_tree = DecisionTreeClassifier(min_impurity_split= 10 ** i)
    d_tree = d_tree.fit(X_train, y_train)
    score = d_tree.score(X_test, y_test)
    if score > best_score_i:
        best_score_parameters_i = 10 ** i
        best_score_i = score
best_score_parameter_j = 0
best_score_j = 0
for j in range(1, 5):
    d_tree = DecisionTreeClassifier(min_samples_leaf= j)
    d_tree = d_tree.fit(X_train, y_train)
    score = d_tree.score(X_test, y_test)
    if score > best_score_j:
        best_score_parameter_j = j
        best_score_j = score
best_score_parameter_k = 0
best_score_k = 0
for k in range(2, 6):
    d_tree = DecisionTreeClassifier(min_samples_split= k)
    d_tree = d_tree.fit(X_train, y_train)
    score = d_tree.score(X_test, y_test)
    if score > best_score_k:
        best_score_parameter_k = k
        best_score_k = score
best_score_parameter_l = 0
best_score_l = 0
for l in range(5):
    d_tree = DecisionTreeClassifier(min_weight_fraction_leaf= l * 1.0 / 10)
    d_tree = d_tree.fit(X_train, y_train)
    score = d_tree.score(X_test, y_test)
    if score > best_score_l:
        best_score_parameter_l = l * 1.0 / 10
        best_score_l = score
best_score_parameter_m = 0
best_score_m = 0
for m in range(2):
    d_tree = DecisionTreeClassifier(presort=m)
    d_tree = d_tree.fit(X_train, y_train)
    score = d_tree.score(X_test, y_test)
    if score > best_score_m:
        best_score_parameter_m = m
        best_score_m = score
d_tree

print(best_score_parameter_i,
      best_score_parameter_j,
      best_score_parameter_k,
      best_score_parameter_l,
      best_score_parameter_m)



0 4 2 0.1 1


In [41]:
d_tree = DecisionTreeClassifier(min_impurity_split = best_score_parameter_i,
                                min_samples_leaf = best_score_parameter_j,
                                min_samples_split = best_score_parameter_k,
                                min_weight_fraction_leaf = best_score_parameter_l,
                                presort = best_score_parameter_m)
d_tree = d_tree.fit(X_train, y_train)
best_score = d_tree.score(X_test, y_test)
print(best_score)



0.640625


How did you choose which parameters to change and what value to give to them? Feel free to show a plot.

I used a for loop to run through the range of reasonable parameters, and individually picked out the best ones.
I changed all the parameters that had a numerical or boolean value

Why is a single decision tree so prone to overfitting?

Because decision trees can continue splitting up the parameter space to better match the data, it means they
will not stop until they have the perfect fitting unless specifically told otherwise.

# Random Forest Classifier

#### Basic Random Forest

* Use sklearn's ensemble.RandomForestClassifier() to create your model.
* Fit the data and labels with your model.
* Score your model with the same data and labels.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html


In [42]:
rf1 = sklearn.ensemble.RandomForestClassifier()

rf1.fit(X_train, y_train)
rf1.score(X_test, y_test)

0.68437499999999996

#### Changing Parameters

In [43]:
rf2 = sklearn.ensemble.RandomForestClassifier(n_estimators=100, criterion='entropy')

rf2.fit(X_train, y_train)
rf2.score(X_test, y_test)

0.8125

What parameters did you choose to change and why?

I increased `num_estimators`, or the number of trees in the forest, since more trees should lead to more reliable results that react better to outliers. In addition, I changed the `criterion` to entropy, since that is the criterion.

How does a random forest classifier prevent overfitting better than a single decision tree?

While the predictions of a single decision tree is highly sensitive to noise in the training set, aggregating the results of many uncorrelated trees makes this much less likely. In this way, using random forests significantly decreases the variance of the model.