In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence, text
from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop-word list from NLTK. This lookup needs the NLTK "stopwords"
# corpus to be available locally already.
stop_words = stopwords.words('english')
# The alias was mistyped as "pltå", which is a legal identifier and therefore
# silently bound the wrong name; use the conventional `plt`.
import matplotlib.pyplot as plt
import nltk
import sklearn
# Fetch the "punkt" tokenizer data (presumably needed by word_tokenize — confirm).
nltk.download('punkt')

In [None]:
#setup imports and paths
import os
import sys
from os.path import expanduser
import itertools

# Put the "packages" folder that sits directly under the user's home
# directory on the import path.
HOME_DIR = expanduser("~")
sys.path.append("{}/packages".format(HOME_DIR))

In [None]:
# Load the custom Midas tooling. `Midas` is not a standard-library module;
# presumably it is resolved through the packages directory appended to
# sys.path earlier in the notebook — confirm.
from Midas import Midas_helper
# Shared helper instance; a later cell calls helper.cd_main_data() on it.
helper = Midas_helper()

In [None]:
# Load the main Midas labelled data table.
# cd_main_data() is a project helper; judging by its name it switches to the
# directory that holds the CSV read below — confirm.
helper.cd_main_data()
import pandas as pd

df = pd.read_csv('midas_labeled_data_Q12018.csv')
# Replace missing final scores with the explicit 'No Score' label. The result
# is assigned back to the column: calling fillna(..., inplace=True) on the
# df[...] selection is chained assignment, which pandas warns about and which
# leaves df untouched when Copy-on-Write is enabled.
df['midas_final_unstructured'] = df['midas_final_unstructured'].fillna('No Score')

In [None]:
# Draw 100 notes as a NumPy array of raw text. The fixed random_state keeps
# the sample (and every result derived from it) identical across kernel
# restarts; without it each run analysed a different set of notes.
data = df['cleaned_note_unstructured'].sample(100, random_state=42).values

In [None]:
# Bag-of-words model: unigrams and bigrams, tokens made of one or more word
# characters, with scikit-learn's built-in English stop words removed.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    stop_words='english',
)

# Learn the vocabulary from the sampled notes and build their document-term
# count matrix in a single step.
data_ctv = ctv.fit_transform(list(data))

In [None]:
# Inspect the vocabulary learned by the fitted CountVectorizer
# (a mapping from each term to its column index in data_ctv).
ctv.vocabulary_

In [None]:
# import the PCA module from sklearn
from sklearn.decomposition import PCA

# CAUTION: PCA has not always accepted sparse input (older scikit-learn
# releases raise a TypeError), and data_ctv is a sparse count matrix.
# Densify it first; with only 100 sampled notes the dense array is small.
# Use TruncatedSVD instead if the matrix is too large to densify.
data_dense = data_ctv.toarray()

# Initialise the PCA model to project every note's count vector onto
# 6 principal components. random_state pins the randomized solver that PCA
# may select for wide inputs, so reruns give the same projection.
pca = PCA(n_components=6, random_state=42)

# Fit the model: it learns the directions of greatest variance in the
# document-term counts and stores how to project onto them.
pca.fit(data_dense)

# Project the notes onto the learnt components.
# NOTE: despite its name, word_vecs_2d has one row per *note* and
# 6 columns (one per component); the name is kept because later cells use it.
word_vecs_2d = pca.transform(data_dense)

# let's look at the projected note vectors
word_vecs_2d

In [None]:
# Reduce the count matrix to a single SVD component; TruncatedSVD is applied
# directly to the sparse matrix. Exactly one component is kept because the
# following cells wrap data_svd in a one-column DataFrame.
# random_state pins TruncatedSVD's randomized solver so the component values
# are reproducible between runs.
svd = decomposition.TruncatedSVD(n_components=1, random_state=42)
svd.fit(data_ctv)
data_svd = svd.transform(data_ctv)

In [None]:
# One row per sampled note: the single SVD component as 'value', plus the
# row position copied into an explicit 'index' column for plotting.
df_svd = pd.DataFrame(data_svd, columns=['value']).assign(
    index=lambda frame: frame.index.values
)

In [None]:
# Preview the first rows of the SVD table ('value' and 'index' columns).
df_svd.head()

In [None]:
import matplotlib.pyplot as plt

# Large canvas with one point per sampled note:
# x = the note's position in the sample, y = its value on the SVD component.
fig, ax = plt.subplots(figsize=(20, 15))
ax.scatter(range(len(data)), data_svd[:, 0])

# render the figure
plt.show()

In [None]:

# Total share of variance captured by the fitted SVD (sum over its components).
# Note: var_explained is rebound to the per-component array in the next cell.
var_explained = svd.explained_variance_ratio_.sum()
var_explained

In [None]:
# Per-component explained-variance ratios of the fitted SVD (one entry per
# component); this array is what select_n_components receives further down.
var_explained = svd.explained_variance_ratio_
var_explained

In [None]:
# create a nice big plot
plt.figure(figsize=(20,15))

# Scatter the sampled notes on their first two principal components.
plt.scatter(word_vecs_2d[:,0], word_vecs_2d[:,1])

# Label every point with the note's position in the sample. The previous
# version zipped over `words`, which is never defined in this notebook, and
# unpacked each 6-component row into just (x, y); both raised errors.
for doc_idx, coord in enumerate(word_vecs_2d):
    x, y = coord[:2]
    plt.text(x, y, str(doc_idx), size=15)

# show the plot
plt.show()

In [None]:
# Helper: how many leading components are needed to reach a variance goal
def select_n_components(var_ratio, goal_var: float) -> int:
    """Count the leading components whose cumulative variance reaches a goal.

    Args:
        var_ratio: Iterable of per-component explained-variance ratios, in
            the order the components should be accumulated.
        goal_var: Cumulative explained variance to reach.

    Returns:
        The number of components consumed when the running total first
        becomes >= goal_var. If the goal is never reached, the total number
        of components is returned; an empty ``var_ratio`` yields 0.
    """
    running_total = 0.0
    count = 0

    for count, ratio in enumerate(var_ratio, start=1):
        running_total += ratio
        # Stop at the first component that gets us to the goal.
        if running_total >= goal_var:
            return count

    # Goal not reached (or nothing to iterate): report everything consumed.
    return count

In [None]:
# Number of SVD components needed to explain 95% of the variance.
# NOTE(review): the SVD above is fitted with n_components=1, so var_explained
# holds a single ratio and this call can only return 1 — refit with more
# components for a meaningful answer.
select_n_components(var_explained, 0.95)

In [None]:
# Load scikit-learn's bundled digits dataset and display its feature matrix.
# NOTE(review): `digits` is not referenced by any other cell visible here —
# confirm whether this cell is still needed.
from sklearn import datasets
digits = datasets.load_digits()
digits.data

In [None]:
%matplotlib inline 
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt



# Four stacked panels sharing the x axis. Every panel draws the same
# index-vs-value scatter but is zoomed to its own y window, ordered from the
# largest values (top) down to the smallest (bottom).
f, (ax1, ax2, ax3, ax4) = plt.subplots(ncols=1, nrows=4, sharex=True, figsize = (13, 10))

y_windows = ((1000, 3000), (1, 8), (0.2, 1), (-0.1, 0.2))
for panel, (y_low, y_high) in zip((ax1, ax2, ax3, ax4), y_windows):
    sns.scatterplot(x="index", y="value", data=df_svd, ax=panel)
    panel.set_ylim(y_low, y_high)

# Drop the x label that seaborn put on the top panel.
ax1.set_xlabel('')

plt.subplots_adjust(wspace=0.0, hspace=0.2)
plt.rcParams.update({'font.size': 22})

In [None]:
# Print the whitespace-split tokens of every sampled note whose SVD value
# exceeds 1000 (the range shown in the top panel of the zoomed plot).
for note, component in zip(data, data_svd):
    if component > 1000:
        print(note.split())

In [None]:
#pretty confusion matrix
from sklearn.metrics import confusion_matrix
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
%matplotlib inline
import numpy as np

def cal_percent(data):
    """Express each positive cell of a 2-D table as a percentage of the grand total.

    Args:
        data: Sequence of rows, each a sequence of numbers (for example a
            confusion matrix given as a list of lists).

    Returns:
        A new list of lists with the same layout. Cells greater than zero
        become ``(cell / grand_total) * 100``; cells that are zero or
        negative are passed through unchanged.
    """
    grand_total = sum([sum(row) for row in data])
    return [
        [(cell / grand_total) * 100 if cell > 0 else cell for cell in row]
        for row in data
    ]

# Raw confusion-matrix counts. They are kept under their own names so the
# `data` array of sampled notes used by earlier cells is not overwritten;
# rebinding `data` here made those cells fail when re-run.
cm_counts = [
    [229,   0,   1,   5,   0,   5],
    [  0,  89,   0,  12,   0,   2],
    [  0,   0, 134,  10,   0,   1],
    [  0,   1,   2, 384,   0,   9],
    [  0,   3,   1,   1,   4,   0],
    [  2,   2,   0,   7,   0, 819],
]

# Convert every count to its share (in %) of all observations.
cm_percent = cal_percent(cm_counts)

y_true = ["No Score", "Grade I", "Grade IV", "Grade II", "Grade III", "Invalid"]
y_pred = ["No Score", "Grade I", "Grade IV", "Grade II", "Grade III", "Invalid"]
# The index is named 'Actual' and the columns 'Predicted' below, so the true
# labels go on the index and the predicted labels on the columns (the two
# lists are identical, so the rendered figure is unchanged).
df_cm = pd.DataFrame(cm_percent, index=y_true, columns=y_pred)
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
plt.figure(figsize = (10,7))
plt.title("Confusion Matrix - Normalized(%)", fontsize=20)
sn.set(font_scale=1.2)  # label size
sn.heatmap(df_cm, cmap="Greens", annot=True, annot_kws={"size": 14})  # annotation font size
plt.show()

In [None]:
# Show the first note that mentions 'assessment days' (case-insensitive),
# if there is one.
first_hit = next(
    (
        note
        for note in df['cleaned_note_unstructured'].values
        if 'assessment days' in note.lower()
    ),
    None,
)
if first_hit is not None:
    print(first_hit)
