In [2]:

# Import necessary libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import math
import string
import requests
from bs4 import BeautifulSoup
import re

import os
import sys
from IPython.display import display, HTML
import urllib
import gzip

import nltk
from nltk import tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import pip


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KarthikeyanNatarajan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KarthikeyanNatarajan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KarthikeyanNatarajan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
def cleanString(review,stopWords):
    lemmatizer = WordNetLemmatizer()
    returnString = ""
    sentence_token = tokenize.sent_tokenize(review)
    idx_list = []
    for j in range(len(sentence_token)):
        single_sentence = tokenize.word_tokenize(sentence_token[j])
        sentences_filtered = [(idx,lemmatizer.lemmatize(w.lower())) for idx,w in enumerate(single_sentence) 
                              if w.lower() not in stopWords and w.isalnum()]
        idx_list.append([x[0] for x in sentences_filtered])
        word_list = [x[1] for x in sentences_filtered]
        returnString = returnString + ' '.join(word_list) + ' . '
    
    return returnString, idx_list

In [3]:
def split_df(dataframe, column_name, training_split = 0.6, validation_split = 0.2, test_split = 0.2):
    """
    Splits a pandas dataframe into trainingset, validationset and testset in specified ratio.
    All sets are balanced, which means they have the same ratio for each categorie as the full set.
    Input:   dataframe        - Pandas Dataframe, should include a column for data and one for categories
             column_name      - Name of dataframe column which contains the categorical output values
             training_split   - from ]0,1[, default = 0.6
             validation_split - from ]0,1[, default = 0.2        
             test_split       - from ]0,1[, default = 0.2
                                Sum of all splits need to be 1
    Output:  train            - Pandas DataFrame of trainset
             validation       - Pandas DataFrame of validationset
             test             - Pandas DataFrame of testset
    """
    if training_split + validation_split + test_split != 1.0:
        raise ValueError('Split paramter sum should be 1.0')
        
    total = len(dataframe.index)
 
    train = dataframe.reset_index().groupby(column_name).apply(lambda x: x.sample(frac=training_split))\
    .reset_index(drop=True).set_index('index')
    train = train.sample(frac=1)
    temp_df = dataframe.drop(train.index)
    validation = temp_df.reset_index().groupby(column_name)\
    .apply(lambda x: x.sample(frac=validation_split/(test_split+validation_split)))\
           .reset_index(drop=True).set_index('index')
    validation = validation.sample(frac=1)
    test = temp_df.drop(validation.index)
    test = test.sample(frac=1)
    
    print('Total: ', len(dataframe))
    print('Training: ', len(train), ', Percentage: ', len(train)/len(dataframe))
    print('Validation: ', len(validation), ', Percentage: ', len(validation)/len(dataframe))
    print('Test:', len(test), ', Percentage: ', len(test)/len(dataframe))

    return train, validation, test

In [78]:
stats=pd.read_csv('PythonQuestions.csv',encoding =  "ISO-8859-1")

In [79]:
stats=stats.dropna(subset=["Question"])

In [80]:
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
stats["Sub_Topic_Code"] = labelEncoder.fit_transform(stats["Sub_Topic_Code"])
stats = stats.rename(columns={'Sub_Topic_Code': 'Category', 'Question': 'Text'})

In [81]:
data_df=stats.sample(frac=1).reset_index(drop=True)

In [82]:
articles = []
n = data_df['Text'].shape[0]
col_number = data_df.columns.get_loc('Text')
stopWords = set(stopwords.words('english'))
data_cleaned = data_df.copy()
for i in range(n):
    temp_string,idx_string = cleanString(data_df.iloc[i,col_number],stopWords)
    articles.append(temp_string)
    print(str(i+1)+' of '+str(n)+" articles cleaned.",end='\r')
    
data_cleaned.loc[:,'Text'] = pd.Series(articles,index=data_df.index)
data_cleaned.loc[:,'Category'] = pd.Categorical(data_cleaned.Category)
data_cleaned['Code'] = data_cleaned.Category.cat.codes
categoryToCode = dict( enumerate(data_cleaned['Category'].cat.categories))

data_cleaned.head()

1 of 340 articles cleaned.2 of 340 articles cleaned.3 of 340 articles cleaned.4 of 340 articles cleaned.5 of 340 articles cleaned.6 of 340 articles cleaned.7 of 340 articles cleaned.8 of 340 articles cleaned.9 of 340 articles cleaned.10 of 340 articles cleaned.11 of 340 articles cleaned.12 of 340 articles cleaned.13 of 340 articles cleaned.14 of 340 articles cleaned.15 of 340 articles cleaned.16 of 340 articles cleaned.17 of 340 articles cleaned.18 of 340 articles cleaned.19 of 340 articles cleaned.20 of 340 articles cleaned.21 of 340 articles cleaned.22 of 340 articles cleaned.23 of 340 articles cleaned.24 of 340 articles cleaned.25 of 340 articles cleaned.26 of 340 articles cleaned.27 of 340 articles cleaned.28 of 340 articles cleaned.29 of 340 articles cleaned.30 of 340 articles cleaned.31 of 340 articles cleaned.32 of 340 articles cleaned.33 of 340 articles cleaned.34 of 340 articles cleaned.35 of 340 articles cleaned.36 of 340 articles cleaned.3

Unnamed: 0,Topic_Code,Category,Text,Code
0,AASKK37,22,set whether unique element set .,22
1,AASKK288,20,data oversampling .,20
2,AASKK108,10,python search list list .,10
3,AASKK171,16,panda pivot table .,16
4,AASKK19,18,ai machine learning big data computing big dat...,18


In [83]:
"""
Split Pandas Dataframe into train, validation and testset.
Convert data to keras conforming form
"""

print(categoryToCode)
train, validation, test = split_df(data_cleaned, 'Code',0.8,0.1,0.1)



{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25}
Total:  340
Training:  272 , Percentage:  0.8
Validation:  29 , Percentage:  0.08529411764705883
Test: 39 , Percentage:  0.11470588235294117


In [84]:
train

Unnamed: 0_level_0,Topic_Code,Category,Text,Code
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
277,AASKK102,10,write list list file .,10
256,AASKK186,13,create 3 dimension matrix numpy like matlab .,13
295,AASKK207,14,outlier boxplot .,14
21,AASKK141,7,defining dynamic function string .,7
38,AASKK28,10,whether list mutable .,10
...,...,...,...,...
310,AASKK72,19,python multiline match .,19
152,AASKK287,2,get unique value column .,2
312,AASKK242,1,draw correlation heatmap .,1
246,AASKK238,23,quantile analysis python .,23


In [85]:
validation

Unnamed: 0_level_0,Topic_Code,Category,Text,Code
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
318,AASKK128,5,dictionary python .,5
27,AASKK184,5,key instead dict key .,5
51,AASKK230,2,subset panda dataframe secondary index reassig...,2
200,AASKK116,5,merge two python dictionary single expression .,5
24,AASKK23,18,python high level language .,18
304,AASKK353,2,generate random dataframe .,2
138,AASKK46,0,class python .,0
226,AASKK312,20,generate random number python .,20
55,AASKK64,9,python different language .,9
40,AASKK07,18,python accessible .,18


In [86]:
test

Unnamed: 0,Topic_Code,Category,Text,Code
72,AASKK317,20,poisson distribution python .,20
59,AASKK31,25,tuple .,25
23,AASKK140,7,passing function multiple return value argumen...,7
268,AASKK265,4,create sample dataframe multiindex .,4
203,AASKK51,7,method different function .,7
150,AASKK150,3,apply value list python .,3
82,AASKK255,20,sample randomly panda dataframe .,20
233,AASKK187,13,curious result numpy function .,13
327,AASKK204,17,plot boxplot panda .,17
22,AASKK338,23,get memory usage statitsics dataframe .,23


In [87]:
def tfidf(data, ma = 0.6, mi = 0.0001):
    tfidf_vectorize = TfidfVectorizer(max_df = ma, min_df = mi)
    tfidf_data = tfidf_vectorize.fit_transform(data)
    return tfidf_data


In [21]:
def test_NaiveBayes(x_train, x_test, y_train, y_test):
    MNB = MultinomialNB()
    NBClassifier = MNB.fit(x_train, y_train)
    predictions = NBClassifier.predict(x_test)
    
    a = accuracy_score(y_test, predictions)
    p = precision_score(y_test, predictions, average = 'weighted')
    r = recall_score(y_test, predictions, average = 'weighted')
    return p, r

In [40]:
def test_SVM(x_train, x_test, y_train, y_test):
    SVM = SVC(kernel = 'linear')
    SVMClassifier = SVM.fit(x_train, y_train)
    predictions = SVMClassifier.predict(x_test)
    a = accuracy_score(y_test, predictions)
    p = precision_score(y_test, predictions, average = 'weighted')
    r = recall_score(y_test, predictions, average = 'weighted')
    return p, r

In [24]:
def test_NN(x_train, x_test, y_train, y_test):
    NN = MLPClassifier(solver = 'lbfgs', alpha = 0.00095, learning_rate = 'adaptive', learning_rate_init = 0.005, max_iter = 300, random_state = 0)
    Perceptron = NN.fit(x_train, y_train)
    predictions = Perceptron.predict(x_test)
    a = accuracy_score(y_test, predictions)
    p = precision_score(y_test, predictions, average = 'weighted')
    r = recall_score(y_test, predictions, average = 'weighted')
    return p, r

In [25]:
def test_SGD(x_train, x_test, y_train, y_test):
    SGD = SGDClassifier(loss = 'modified_huber')
    SGDC = SGD.fit(x_train1, y_train)
    predictions = SGDC.predict(x_test1)
    a = accuracy_score(y_test, predictions)
    p = precision_score(y_test, predictions, average = 'weighted')
    r = recall_score(y_test, predictions, average = 'weighted')
    return p, r

In [27]:
def test_voting(x_train, x_test, y_train, y_test):
    SVM = SVC(kernel = 'linear', probability = True)
    SGD = SGDClassifier(loss = 'modified_huber')
    EnsembleClassifier = VotingClassifier(estimators = [('sgd', SGD), ('svc', SVM)], voting = 'soft', weights = [1,1])
    EnsembleClassifier = EnsembleClassifier.fit(x_train, y_train)
    predictions = EnsembleClassifier.predict(x_test)
    a = accuracy_score(y_test, predictions)
    p = precision_score(y_test, predictions, average = 'weighted')
    r = recall_score(y_test, predictions, average = 'weighted')
    return p, r

In [37]:
x_test = tfidf(test["Text"])

In [30]:
import csv
import numpy as np
import string
import re
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score
from sklearn.decomposition import TruncatedSVD

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer,self).build_analyzer()
        return lambda doc:(english_stemmer.stem(word) for word in analyzer(doc))

In [88]:
import re

def build_tokenizer(doc):
    token_pattern=r"(?u)\b\w\w+\b"
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(doc)
posts_root1=[]
for post in train["Text"]:
    #print build_tokenizer(post)
    #print " ".join([english_stemmer.stem(word) for word in build_tokenizer(post)])
    posts_root1.append( " ".join([english_stemmer.stem(word) for word in build_tokenizer(post)]) )

print(posts_root1)
#posts_root = [ " ".join(english_stemmer.stem(word)) for doc in posts for word in build_tokenizer(doc)]

vectorizer_tfidf=StemmedTfidfVectorizer(ngram_range=(1, 3),lowercase=False,min_df=1,stop_words="english")



x_tfidf_test=vectorizer_tfidf.fit_transform(posts_root1)

print("feature_name:%s" % vectorizer_tfidf.get_feature_names())


['write list list file', 'creat dimens matrix numpi like matlab', 'outlier boxplot', 'defin dynam function string', 'whether list mutabl', 'method oper data class', 'correl datafram panda', 'boxplot interpret', 'set whether uniqu element set', 'uniform distibut python', 'data downsampl panda', 'creat datetim index panda', 'merg two python dictionari singl express', 'fill misss valu datatim format panda', 'multithread achiev python', 'creat custom function python', 'delet duplic dictionari list python', 'find locat item python list list', 'new instanc made class', 'generat random number python', 'dictionari', 'util panda datafram', 'creat sampl datafram datatim', 'method alway associ object class', 'creat panda seri', 'state class instanc', 'call function modul strong function name python', 'differ arithmat oper avail python', 'comment python', 'generat histogram python', 'panda', 'data oversampl', 'python advic beginn regex dictionari etc', 'differ deep shallow copi', 'type convers pyt

In [98]:
def tokenize_test(model,train,validation,vect):
    X_train_dtm = vect.fit_transform(train["Text"])
    print ('Features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(validation["Text"])
    model.fit(X_train_dtm, train["Code"])
    y_pred_class = model.predict(X_test_dtm)
    print("Training Accuracy")
    print(model.score(X_train_dtm,train["Code"]))
    print("Testing Accuracy")
    print(model.score(X_test_dtm,validation["Code"]))
    print(classification_report(y_pred_class,validation["Code"]))

In [100]:
tokenize_test(MultinomialNB(),train,validation,vectorizer_tfidf)

Features:  1302
Training Accuracy
0.6544117647058824
Testing Accuracy
0.5172413793103449
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      1.00      1.00         1
           2       0.75      0.38      0.50         8
           3       0.00      0.00      0.00         0
           5       1.00      1.00      1.00         3
           6       0.00      0.00      0.00         0
           7       1.00      1.00      1.00         2
           9       0.00      0.00      0.00         0
          10       1.00      0.50      0.67         4
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          17       1.00      1.00      1.00         2
          18       0.33      0.14      0.20         7
          20       0.50      0.50      0.50         2
          23       0.00      0.00      0.00   

  _warn_prf(average, modifier, msg_start, len(result))


In [92]:
tokenize_test(SVC(kernel = 'linear'),vectorizer_tfidf)

Features:  1302
Training Accuracy
0.9816176470588235
Testing Accuracy
0.6896551724137931
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           2       0.75      1.00      0.86         3
           3       0.00      0.00      0.00         0
           5       1.00      1.00      1.00         3
           6       1.00      0.50      0.67         2
           7       1.00      0.67      0.80         3
           9       0.00      0.00      0.00         0
          10       1.00      0.67      0.80         3
          11       0.00      0.00      0.00         0
          12       1.00      1.00      1.00         1
          15       0.00      0.00      0.00         0
          17       1.00      1.00      1.00         2
          18       0.67      0.29      0.40         7
          20       0.50      0.50      0.50         2
          23       0.50      1.00      0.67   

  _warn_prf(average, modifier, msg_start, len(result))


In [93]:
mlp=MLPClassifier(solver = 'lbfgs', alpha = 0.00095, learning_rate = 'adaptive', learning_rate_init = 0.005, max_iter = 300, random_state = 0)
tokenize_test(mlp,vectorizer_tfidf)

Features:  1302
Training Accuracy
0.9963235294117647
Testing Accuracy
0.6896551724137931
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           2       0.75      1.00      0.86         3
           3       1.00      1.00      1.00         1
           5       0.67      1.00      0.80         2
           6       1.00      0.50      0.67         2
           7       1.00      0.67      0.80         3
           9       0.00      0.00      0.00         0
          10       1.00      0.67      0.80         3
          11       0.00      0.00      0.00         1
          12       1.00      1.00      1.00         1
          15       0.00      0.00      0.00         0
          17       1.00      1.00      1.00         2
          18       0.67      0.33      0.44         6
          20       0.50      0.50      0.50         2
          23       0.50      1.00      0.67   

  _warn_prf(average, modifier, msg_start, len(result))


In [94]:
SGD = SGDClassifier(loss = 'modified_huber')
tokenize_test(SGD,vectorizer_tfidf)

Features:  1302
Training Accuracy
0.9963235294117647
Testing Accuracy
0.6551724137931034
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           2       0.75      1.00      0.86         3
           3       0.00      0.00      0.00         0
           5       0.67      1.00      0.80         2
           6       1.00      0.50      0.67         2
           7       1.00      0.67      0.80         3
           9       0.00      0.00      0.00         0
          10       1.00      0.50      0.67         4
          11       0.00      0.00      0.00         1
          12       1.00      1.00      1.00         1
          15       0.00      0.00      0.00         0
          17       1.00      1.00      1.00         2
          18       0.67      0.33      0.44         6
          20       0.50      0.50      0.50         2
          23       0.50      1.00      0.67   

  _warn_prf(average, modifier, msg_start, len(result))


In [95]:
    SVM = SVC(kernel = 'linear', probability = True)
    SGD = SGDClassifier(loss = 'modified_huber')
    EnsembleClassifier = VotingClassifier(estimators = [('sgd', SGD), ('svc', SVM)], voting = 'soft', weights = [1,1])
    tokenize_test(EnsembleClassifier,vectorizer_tfidf)

Features:  1302
Training Accuracy
0.9963235294117647
Testing Accuracy
0.6206896551724138
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      1.00      1.00         1
           2       0.75      1.00      0.86         3
           3       0.00      0.00      0.00         0
           5       0.67      1.00      0.80         2
           6       1.00      0.50      0.67         2
           7       1.00      0.67      0.80         3
           9       0.00      0.00      0.00         0
          10       1.00      0.50      0.67         4
          11       0.00      0.00      0.00         1
          12       1.00      1.00      1.00         1
          15       0.00      0.00      0.00         0
          17       1.00      1.00      1.00         2
          18       0.67      0.29      0.40         7
          20       0.50      0.50      0.50         2
          23       0.50      1.00      0.67   

  _warn_prf(average, modifier, msg_start, len(result))
