In [77]:
import pandas as pd
import numpy as np
from sklearn.tree import export_graphviz
import pydot
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import model_selection, metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
    
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

from nltk.stem.snowball import SnowballStemmer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [84]:
stop_words = set(stopwords.words('english')) 
stemmer = SnowballStemmer("english")
analyser = SentimentIntensityAnalyzer()

In [85]:
b = pd.read_csv("2_KS_train_w_vader.csv")

In [86]:
df = pd.read_csv("KS_train_final.csv")

In [87]:
df['blurb'] = b['blurb']

In [88]:
df['blurb'].fillna(" ", inplace=True)

In [89]:
for index, value in df['blurb'].items():
    s = analyser.polarity_scores(value)
    df.at[index, 'sentiment_com'] = s['compound']
    
    s = "".join([ c if c.isalpha() else " " for c in value ])
    s = s.lower()
    word_tokens = word_tokenize(s)
    filtered_sentence = [] 
  
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(stemmer.stem(w))
            
    s = ' '.join(filtered_sentence)
    
    df.at[index, 'clean_blurb'] = s

In [90]:
df = df.drop(['blurb'], axis=1)

In [91]:
df.head(5)

Unnamed: 0,staff_pick,funded,is_cat_art,is_cat_music,is_cat_film,is_cat_technology,is_cat_publishing,is_cat_food,is_cat_games,is_cat_fashion,...,is_2018,sentiment_pos,sentiment_neu,sentiment_neg,blurb_bayes,log_goal_usd,log_days_to_launch,log_days_to_dealine,sentiment_com,clean_blurb
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.770405,0.536852,0.571877,0.636119,0.0,artist resid elsewher studio summer stretch wi...
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.913929,0.55716,0.417726,0.786681,0.0,artist public art make instal washington mall ...
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.6608,0.56167,0.269857,0.89016,0.7717,sequel favorit machin myphoneheng celebr commu...
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.99395,0.5729,0.285142,0.624541,0.34,film explor role valu art educ today histori s...
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.62379,0.608693,0.51524,0.713862,0.0,need build kitchen habit space self sustain fa...


In [92]:
X = df.drop(['funded', 'is_cat_music', 'is_cat_film', 'is_cat_publishing', 'is_cat_games', 'is_cat_fashion',
            'is_cat_comics', 'is_cat_other', 'is_country_us', 'is_loc_ca', 'is_loc_ny', 'is_loc_uk',
            'is_loc_tx', 'is_loc_cd', 'is_loc_fl', 'is_loc_il', 'is_loc_wa', 'is_loc_pa',
            'is_loc_other', 'is_2009', 'is_2017', 'is_2018', 'blurb_bayes', 'sentiment_pos', 'sentiment_neu',
            'sentiment_neg'], axis=1)

In [93]:
y = df['funded']

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [95]:
X_NB_train = X_train.clean_blurb

In [96]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(X_NB_train.values)

In [97]:
classifier = MultinomialNB(alpha=1)
targets = y_train.values
classifier.fit(counts, targets)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [98]:
predictions = classifier.predict_proba(counts)
# i = 0

# for row in predictions:
#     X_train.loc[i, 'blurb_bayes'] = row[1]
#     i+=1

In [99]:
X_train['blurb_bayes'] = predictions.T[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [100]:
X_train = X_train.drop(['clean_blurb'], axis=1)

In [101]:
X_train.head(5)

Unnamed: 0,staff_pick,is_cat_art,is_cat_technology,is_cat_food,is_2010,is_2011,is_2012,is_2013,is_2014,is_2015,is_2016,log_goal_usd,log_days_to_launch,log_days_to_dealine,sentiment_com,blurb_bayes
53779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.512899,0.388768,0.713862,0.7003,0.028766
51976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.532002,0.395099,0.713862,0.8516,0.979398
48953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.613578,0.417726,0.713862,0.0,0.218659
13609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.463114,0.510248,0.557388,0.0,0.996608
34131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.546403,0.465047,0.752808,-0.5994,0.853571


In [102]:
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, max_features='sqrt', random_state = 42)
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [103]:
####### NAIVE BAYES on test data #########

counts = vectorizer.transform(X_test.clean_blurb.values)
predictions = classifier.predict_proba(counts)

In [104]:
X_test['blurb_bayes'] = predictions.T[1]
X_test = X_test.drop(['clean_blurb'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [105]:
rfc_predict = rfc.predict(X_test)

In [106]:
rfc_cv_score = cross_val_score(rfc, X_test, y_test, cv=10, scoring='roc_auc')

In [107]:
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[4195 1980]
 [1471 6700]]


=== Classification Report ===
              precision    recall  f1-score   support

         0.0       0.74      0.68      0.71      6175
         1.0       0.77      0.82      0.80      8171

    accuracy                           0.76     14346
   macro avg       0.76      0.75      0.75     14346
weighted avg       0.76      0.76      0.76     14346



=== All AUC Scores ===
[0.82761254 0.84507809 0.84011875 0.83502672 0.83085366 0.82687184
 0.86179425 0.83034544 0.84825497 0.84675722]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.8392713480978072


In [108]:
metrics.accuracy_score(y_test, rfc_predict)

0.7594451415028579

In [109]:
rfc_predict = rfc.predict(X_train)

In [110]:
rfc_cv_score = cross_val_score(rfc, X_train, y_train, cv=10, scoring='roc_auc')

In [111]:
print("=== Confusion Matrix ===")
print(confusion_matrix(y_train, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_train, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[26420  8739]
 [ 6629 39500]]


=== Classification Report ===
              precision    recall  f1-score   support

         0.0       0.80      0.75      0.77     35159
         1.0       0.82      0.86      0.84     46129

    accuracy                           0.81     81288
   macro avg       0.81      0.80      0.81     81288
weighted avg       0.81      0.81      0.81     81288



=== All AUC Scores ===
[0.8821139  0.87737291 0.87896444 0.88356883 0.87798943 0.88303434
 0.88093876 0.87936606 0.87691195 0.87868249]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.8798943113434593


In [112]:
metrics.accuracy_score(y_train, rfc.predict(X_train))

0.8109438047436276

In [113]:
importances = list(rfc.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X_test.columns, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: blurb_bayes          Importance: 0.52
Variable: staff_pick           Importance: 0.12
Variable: log_goal_usd         Importance: 0.11
Variable: log_days_to_launch   Importance: 0.05
Variable: is_2013              Importance: 0.03
Variable: is_2015              Importance: 0.03
Variable: is_2011              Importance: 0.02
Variable: is_2012              Importance: 0.02
Variable: is_2014              Importance: 0.02
Variable: log_days_to_dealine  Importance: 0.02
Variable: is_cat_art           Importance: 0.01
Variable: is_cat_technology    Importance: 0.01
Variable: is_cat_food          Importance: 0.01
Variable: is_2016              Importance: 0.01
Variable: sentiment_com        Importance: 0.01
Variable: is_2010              Importance: 0.0
