In [10]:
import pandas as pd

In [15]:
full_data = pd.read_csv("qadata.csv", encoding="latin-1")

In [16]:
full_data.head()

Unnamed: 0,questionId,sectionId,lessonId,questionBody,questionType,authorId,questionResponse,questionlikecount,responselikecount,responseCount,isEndorsed,include,questionDate,Unnamed: 13
0,00a712ff-e146-437a-877c-4802ea0e5822,3603bfd4-7c3a-427f-b0ba-8a344025f329,G_98cb3946-2c76-45d8-b588-9672c900f350_3603bfd...,Do we need to remember the numbers of how much...,content,cad2791d-6298-45bc-b6dd-82aac0137420,General idea! And what potential effects that ...,0,0,1,0,0,2017-09-18 14:24:33,
1,00f8ce11-8d85-40bb-b08a-e79a688d8857,3603bfd4-7c3a-427f-b0ba-8a344025f329,G_98cb3946-2c76-45d8-b588-9672c900f350_3603bfd...,are the exams going to be at office hours today?,logistic,f2e2ceb1-0ac9-4148-8cd4-cf58d94e2dad,"As stated in the canvas announcement, yes they...",1,0,2,0,0,2017-10-18 14:03:46,
2,0135b77d-45d1-4e42-8a26-de5f5768560c,3603bfd4-7c3a-427f-b0ba-8a344025f329,G_98cb3946-2c76-45d8-b588-9672c900f350_3603bfd...,When will the participation grades for each le...,logistic,8df1c0a3-c67b-43f7-a789-75c77448918b,I'm not sure I'll ask Perry. I don't know if y...,0,1,1,0,0,2017-09-08 14:46:49,
3,015973d9-3b57-48f5-a0ef-24266a125956,3603bfd4-7c3a-427f-b0ba-8a344025f329,G_98cb3946-2c76-45d8-b588-9672c900f350_3603bfd...,"Is there any way to get rid of ""hide live"" not...",logistic,d5233013-29fb-4f29-b0e8-81d467757270,"Not sure, you can toggle the screens though an...",0,0,1,0,0,2017-09-15 14:45:07,
4,01f4105b-ac0f-4e37-b897-33b015dec191,3603bfd4-7c3a-427f-b0ba-8a344025f329,G_98cb3946-2c76-45d8-b588-9672c900f350_3603bfd...,slide 38 is still hidden,logistic,4feea5a4-eb6b-42b0-9d80-352b8e887c63,I don't think we ever got to that slide. it's ...,6,0,1,0,0,2017-09-18 14:54:36,


In [68]:
full_data = full_data[["questionBody", "questionType"]]
full_training = full_data[(full_data.questionType == "logistic") | (full_data.questionType == "content")]
training = full_training.sample(frac=0.75)
training.head()

Unnamed: 0,questionBody,questionType
39,when will the review powerpoint be released?,logistic
17,Why is there no cloud on the other side of the...,content
29,what exactly is the milankovitch theory of cli...,content
42,Where can we find cold clouds again?,content
37,"My computer will not play the live stream, it ...",logistic


In [55]:
categories = ['content', 'logistic']

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(training.questionBody)
X_train_counts.shape

(33, 228)

In [57]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(33, 228)

In [58]:
# Machine Learning
# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, training.questionType)

In [59]:
# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
# The names ‘vect’ , ‘tfidf’ and ‘clf’ are arbitrary but will be used later.
# We will be using the 'text_clf' going forward.
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

text_clf = text_clf.fit(training.questionBody, training.questionType)

In [60]:
# Performance of NB Classifier
import numpy as np
predicted = text_clf.predict(full_training.questionBody)
np.mean(predicted == full_training.questionType)

0.9545454545454546

In [61]:
# Training Support Vector Machines - SVM and calculating its performance

from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42))])

text_clf_svm = text_clf_svm.fit(training.questionBody, training.questionType)
predicted_svm = text_clf_svm.predict(full_training.questionBody)
np.mean(predicted_svm == full_training.questionType)



0.9545454545454546

In [62]:
# Grid Search
# Here, we are creating a list of parameters for which we would like to do performance tuning. 
# All the parameters name start with the classifier name (remember the arbitrary name we gave). 
# E.g. vect__ngram_range; here we are telling to use unigram and bigrams and choose the one which is optimal.

from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}

In [63]:
# Next, we create an instance of the grid search by passing the classifier, parameters 
# and n_jobs=-1 which tells to use multiple cores from user machine.

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(training.questionBody, training.questionType)

In [64]:
# To see the best mean score and the params, run the following code

gs_clf.best_score_
gs_clf.best_params_

{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

In [65]:
# Similarly doing grid search for SVM
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-2, 1e-3)}

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(training.questionBody, training.questionType)


gs_clf_svm.best_score_
gs_clf_svm.best_params_



{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

In [66]:
# NLTK
# Removing stop words
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), 
                     ('clf', MultinomialNB())])

In [67]:
# Stemming Code
import nltk

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('mnb', MultinomialNB(fit_prior=False))])

text_mnb_stemmed = text_mnb_stemmed.fit(training.questionBody, training.questionType)

predicted_mnb_stemmed = text_mnb_stemmed.predict(full_training.questionBody)

np.mean(predicted_mnb_stemmed == full_training.questionType)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


0.9545454545454546

In [69]:
to_predict = full_data[(full_data.questionType != "logistic") & (full_data.questionType != "content")]

In [70]:
text_to_predict = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('mnb', MultinomialNB(fit_prior=False))])

text_to_predict = text_to_predict.fit(full_training.questionBody, full_training.questionType)

predicted_to_predict = text_to_predict.predict(to_predict.questionBody)

In [73]:
to_predict['questionType'] = predicted_to_predict

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [75]:
to_predict

Unnamed: 0,questionBody,questionType
30,could you explain how to get to the correct an...,logistic
31,"if you won't be in class on Friday, will exam ...",logistic
45,Is anyone elses sound not working?,logistic
46,Does correctness of answer influence the grade?,logistic
48,howd he get 6000 ??????,content
49,the opening 2 questions wont load on my comput...,logistic
50,Where can I find the 'DIY Grading' ? I did not...,logistic
51,Is there any specific temperature that stage o...,content
52,What was the Exam average?,logistic
53,Are the lectures this week going to be on the ...,logistic
