In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import warnings
import model_report as mr
warnings.filterwarnings('ignore')
%matplotlib inline

In [4]:
# Folder that holds the three keyword CSV splits.
# A raw string keeps every backslash literal. The previous literals contained
# 'Capstone A\Data', where '\D' is an invalid escape sequence that only worked
# by accident (and triggers a warning on recent Python versions).
# NOTE(review): this is a machine-specific absolute path; point DATA_DIR at your
# own copy of the data before running.
DATA_DIR = r'C:\Users\blgai\OneDrive\Documents\School\SMU\Courses\Fall 2021\Capstone A\Data'

# Train / test / holdout splits; each is read into its own DataFrame.
df_train = pd.read_csv(DATA_DIR + '\\train_keywords_v1.csv')
df_test = pd.read_csv(DATA_DIR + '\\test_keywords_v1.csv')
df_holdout = pd.read_csv(DATA_DIR + '\\holdout_keywords_v1.csv')

In [6]:
# Every split uses the same two columns: the cleaned keyword text is the model
# input (X_*) and the category label is the prediction target (y_*).

# training split
X_train, y_train = df_train['clean_key_words'], df_train['category']

# test split
X_test, y_test = df_test['clean_key_words'], df_test['category']

# holdout split
X_holdout, y_holdout = df_holdout['clean_key_words'], df_holdout['category']

In [7]:
from sklearn import metrics
def get_metrics(true_labels, predicted_labels):
    """Score a set of predictions against the true labels.

    Accuracy is computed directly; precision, recall and F1 are computed with
    ``average='weighted'``. Every score is rounded to 4 decimal places.

    Parameters
    ----------
    true_labels
        Ground-truth class labels.
    predicted_labels
        Labels predicted by the model, in the same order as ``true_labels``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` in that order.
    """
    # The three class-averaged scores share the same call signature.
    weighted_scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)

    accuracy = np.round(metrics.accuracy_score(true_labels, predicted_labels), 4)
    precision, recall, f1 = (
        np.round(scorer(true_labels, predicted_labels, average='weighted'), 4)
        for scorer in weighted_scorers
    )

    return accuracy, precision, recall, f1


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

# Experiment settings. They are used both to build the estimators and to label
# the rows of the results table, so the two cannot drift apart.
CLASS_WEIGHTS = {'dr':8,'sx':10}
TFIDF_MIN_DF = 25
TFIDF_MAX_DF = .9

svm = LinearSVC(penalty='l2',C=1,class_weight=CLASS_WEIGHTS,random_state=1234)

# candidate numbers of features to keep after chi-squared selection
feat_select_k = [500,800,1100,1400,1700,2000,2300,2600,2900]

#build TF-IDF features on train corpus
tv = TfidfVectorizer(use_idf=True, min_df=TFIDF_MIN_DF,max_df=TFIDF_MAX_DF, norm="l2",smooth_idf=True)
tv_train_features = tv.fit_transform(X_train)
#transform test corpus into features (vocabulary/idf come from the train corpus only)
tv_test_features = tv.transform(X_test)
#transform holdout corpus into features
tv_holdout_features = tv.transform(X_holdout)

print('TFIDF model:> Train features shape:',tv_train_features.shape,'Test features shape:',tv_test_features.shape,'Holdout features shape:',tv_holdout_features.shape)

# human-readable label for the class weights, e.g. 'dr:8 and sx:10'
class_weights_label = ' and '.join(f'{label}:{weight}' for label, weight in CLASS_WEIGHTS.items())

#collect one record (dict) of metrics per value of k; reset on every run of this cell
data_dict = []

#select the k best features, train, test, record metrics
for k in feat_select_k:

    # Fit the chi-squared selector on the training split only, then let the
    # fitted selector pick the same k columns out of every matrix. This replaces
    # the earlier detour through tv.get_feature_names(), which no longer exists
    # in recent scikit-learn releases.
    selector = SelectKBest(chi2, k=k)
    tv_train_features_sub = selector.fit_transform(tv_train_features, y_train)
    tv_test_features_sub = selector.transform(tv_test_features)
    # the holdout matrix is reduced alongside the others but is not scored in this loop
    tv_holdout_features_sub = selector.transform(tv_holdout_features)

    #train model using only the selected features
    svm.fit(tv_train_features_sub,y_train)

    # 5-fold CV accuracy on the training split.
    # NOTE(review): the selector above was fitted on the full training split, so
    # each CV fold's held-out rows already influenced which features were kept.
    # This score is therefore optimistic. Wrapping selector + SVM in a Pipeline
    # and passing that to cross_val_score would remove the leak.
    svm_tfidf_cv_scores = cross_val_score(svm,tv_train_features_sub,y_train,cv=5)
    svm_tfidf_cv_mean_score = np.mean(svm_tfidf_cv_scores)

    # single prediction pass on the test split; all test metrics derive from it
    svm_predictions = svm.predict(tv_test_features_sub)
    accuracy, precision, tpr, f1 = get_metrics(true_labels=y_test,predicted_labels=svm_predictions)

    #append metrics from this model to the list of records
    data_dict.append({'No_Features':k,
                      'Model':'SVM',
                      'Class_Weights':class_weights_label,
                      'tfidf_mindf':TFIDF_MIN_DF,
                      'tfidf_maxdf':TFIDF_MAX_DF,
                      'cv_5_mean_acc':svm_tfidf_cv_mean_score,
                      'test_acc':accuracy,
                      'precision':precision,
                      'TPR/Recall':tpr,
                      'F1 Score':f1
                     })

#create dataframe from the list of records (one row per k)
df_overall = pd.DataFrame(data_dict)

#view all metrics
df_overall


TFIDF model:> Train features shape: (29302, 2940) Test features shape: (8386, 2940) Holdout features shape: (4188, 2940)


Unnamed: 0,No_Features,Model,Class_Weights,tfidf_mindf,tfidf_maxdf,cv_5_mean_acc,test_acc,precision,TPR/Recall,F1 Score
0,500,SVM,dr:8 and sx:10,25,0.9,0.716538,0.7137,0.7128,0.7137,0.7085
1,800,SVM,dr:8 and sx:10,25,0.9,0.743567,0.7369,0.7344,0.7369,0.7334
2,1100,SVM,dr:8 and sx:10,25,0.9,0.757116,0.7555,0.7529,0.7555,0.7531
3,1400,SVM,dr:8 and sx:10,25,0.9,0.762371,0.761,0.7587,0.761,0.759
4,1700,SVM,dr:8 and sx:10,25,0.9,0.764521,0.7635,0.7615,0.7635,0.7618
5,2000,SVM,dr:8 and sx:10,25,0.9,0.7679,0.7659,0.7635,0.7659,0.7641
6,2300,SVM,dr:8 and sx:10,25,0.9,0.766774,0.7648,0.7631,0.7648,0.7635
7,2600,SVM,dr:8 and sx:10,25,0.9,0.764487,0.7648,0.7631,0.7648,0.7635
8,2900,SVM,dr:8 and sx:10,25,0.9,0.761996,0.7632,0.7616,0.7632,0.7619
9,500,SVM,dr:8 and sx:10,25,0.9,0.716538,0.7137,0.7128,0.7137,0.7085


In [15]:
import altair as alt

# Clamp the y-axis to the observed accuracy range so the small differences
# between feature counts stay visible.
test_accuracy = df_overall['test_acc']
y_max = test_accuracy.max()
y_min = test_accuracy.min()

accuracy_axis = alt.Y('test_acc',scale=alt.Scale(domain=(y_min,y_max)))

# Line chart of test accuracy against the number of selected features.
alt.Chart(df_overall).mark_line().encode(
    accuracy_axis,
    x='No_Features'
).properties(title="Change in SVM Test Accuracy based on Number of Features - Key Words")

In [13]:
# Show the row(s) with the highest test accuracy. The maximum is computed here
# so this cell does not depend on a variable set by the plotting cell.
df_overall[df_overall['test_acc'] == df_overall['test_acc'].max()]

Unnamed: 0,No_Features,Model,Class_Weights,tfidf_mindf,tfidf_maxdf,cv_5_mean_acc,test_acc,precision,TPR/Recall,F1 Score
5,2000,SVM,dr:8 and sx:10,25,0.9,0.7679,0.7659,0.7635,0.7659,0.7641
14,2000,SVM,dr:8 and sx:10,25,0.9,0.7679,0.7659,0.7635,0.7659,0.7641
23,2000,SVM,dr:8 and sx:10,25,0.9,0.7679,0.7659,0.7635,0.7659,0.7641
