In [1]:
import pandas as pd
import numpy as np

from sklearn.externals import joblib
import _pickle as pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier as RFC

dev_loc = r'/axp/buanalytics/csswcpfwt/dev/'

In [2]:
#pull the processed data
df_data = pd.read_csv(dev_loc+"/processed_data.csv")

#dropna, if missed in preprocess
df_data = df_data.dropna(how='any')

df_data.head(1)

Unnamed: 0,featr_nm,featr_desc,story_nm,story_ds,PFWT
1,france go balancing,ability balancing mechanism goand q inform q f...,go file creation carry,product manager want ensure status update file...,New Application Development / New App Dev Testing


In [3]:
# get x, y

i=0
for s in df_data.columns:
    if s == 'PFWT': continue
    if i==0:
        df_data["combined"] = df_data[s]
    else:
        df_data["combined"] = df_data["combined"] + df_data[s]
    i=i+1
    
#print(df_data.head(1))

x = df_data["combined"]
y = df_data['PFWT']
# print(x.head(1))


In [4]:
#vectorize
vectorizer = CountVectorizer()
vec_train = vectorizer.fit_transform(x)
feature_file = dev_loc+"/pfwt_vectors.pkl"
pickle.dump(vectorizer.vocabulary_, open(feature_file,"wb"))

loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=pickle.load(open(feature_file, "rb")))
transformer = TfidfTransformer()
x_vec = transformer.fit_transform(loaded_vec.fit_transform(x))
print(x_vec.shape)


(48822, 110707)


In [5]:
import warnings
warnings.filterwarnings('ignore')

x_vec = x_vec.toarray()
# #split train and test
xtrain, xtest, ytrain, ytest = train_test_split(x_vec, y)
print(len(xtrain), len(ytrain)); print(len(xtest), len(ytest))

for n in [8]:
    
    print("Results for estimator size : "+str(n)); print("")
    
    classifier = RFC(bootstrap=True, class_weight=None, criterion='gini',
     max_depth=None, max_features='auto', max_leaf_nodes=None,
     min_impurity_split=1e-07, min_samples_leaf=1,
     min_samples_split=2, min_weight_fraction_leaf=0.0,
     n_estimators=n, n_jobs=1, oob_score=False, random_state=None,
     verbose=0, warm_start=False)

    classifier.fit(xtrain, ytrain)
    
    train_predictions = classifier.predict(xtrain)
    train_accuracy = accuracy_score(train_predictions, ytrain); print("training accuracy: ", train_accuracy); print("")
    print(classification_report(ytrain, train_predictions)); print("")

    test_predictions = classifier.predict(xtest)
    test_accuracy = accuracy_score(test_predictions, ytest); print("testing accuracy: ", test_accuracy); print("")
    print(classification_report(ytest, test_predictions)); print("")
    

36616 12206
36616 12206
Results for estimator size : 8

training accuracy:  0.9869729080183526

                                                   precision    recall  f1-score   support

New Application Development / New App Dev Testing       0.98      1.00      0.99     18277
     Other Non-Application Development activities       1.00      0.97      0.98     11039
                     Research & Development (R&D)       1.00      0.98      0.99      7300

                                        micro avg       0.99      0.99      0.99     36616
                                        macro avg       0.99      0.98      0.99     36616
                                     weighted avg       0.99      0.99      0.99     36616


testing accuracy:  0.8149270850401442

                                                   precision    recall  f1-score   support

New Application Development / New App Dev Testing       0.77      0.95      0.85      6130
     Other Non-Application Development ac

In [6]:
#save model
model_file = dev_loc+'/pfwt_model.sav'
joblib.dump(classifier, model_file)

['/axp/buanalytics/csswcpfwt/dev//pfwt_model.sav']