In [1]:
from sklearn.model_selection import train_test_split
from nltk.stem.snowball import EnglishStemmer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import pandas
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
import sys
import nltk
import re
import progressbar

# Load Original Dataset:

In [None]:
data = pandas.read_csv('Chat_Text_Data.csv')
data.drop_duplicates(inplace=True)

In [None]:
stemmer = EnglishStemmer()
analyzer = CountVectorizer(preprocessor=lambda x: re.sub(r'(\d[\d\.])+', '', x.lower()),
                            stop_words='english',lowercase=True,ngram_range=(1,1),
                            token_pattern=u'(?u)\\b\\w\\w+\\b',analyzer='word').build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))
vectorizer = CountVectorizer(analyzer=stemmed_words,min_df=0.001)
res = vectorizer.fit_transform(data['Text'].values)

In [None]:
df = pandas.DataFrame(res.toarray(),columns=vectorizer.get_feature_names(),dtype=float)
df.drop_duplicates(inplace=True)
df['rationale']=""
df['label']=0
df['label'] = df['label'].astype('int')
df.to_csv('unlabeld_training.csv',index=False)

### code for merging old and new dataframes: (don't run)

In [None]:
df_old = pandas.read_csv('main_train_clean.csv',index_col=False)
df_new = df
#get column names that are nonzero from a row in a dataframe
def get_nonzero_cols(df,i):
    col_names =[]
    s = list(df.columns)
    s.pop()
    for col in s:
        if df.iloc[i][col]==1:
            col_names.append(col)
    return col_names
#check for matching row
def check_row(row,col_names):
    for col_name in col_names:
        try:
            meh = row[col_name]
            if meh != 1:
                return False
        except:
            pass
    return True
for i in range(df_old.shape[0]):
    print(i)
    for j in range(df_new.shape[0]):
        if check_row(df_new.iloc[j],get_nonzero_cols(df_old,i)):
            df_new.set_value(j,'rationale',df_old.iloc[i]['rationale'])
            df_new.set_value(j,'label',df_old.iloc[i]['label'])
            break

## Label initial training and test set manually: (DON'T RUN)

In [None]:
def label(df):
    for i in range(df.shape[0]):
        if i > 301:
            break  
        if df.iloc[i]['rationale']=='' :
            print(data.iloc[i]['Text'])
            word = input()
            if word == 'exit':
                break
            label = input()
            df.set_value(i,'label',label)
            df.set_value(i,'rationale',word)
    df.to_csv('labeled_instances_300.csv',index=False)

### fix format of dataframe:

In [None]:
df = pandas.read_csv('labeled_instances_300.csv',index_col=False,low_memory=False)
df = pandas.DataFrame.astype(df,dtype=float,errors='ignore')
df['label'] = df['label'].astype('int')
df.drop_duplicates(inplace=True)
df.fillna('',inplace=True)
#df.to_csv('labeled_instances_300.csv',index=False)

In [None]:
df_train=df[0:50]
df_test=df[50:300]

train_x,test_x,train_y,test_y = train_test_split(df.drop(['label'], axis =1),df['label'],random_state=42,test_size=0.33)

In [None]:
row = df_test.iloc[7]
print(row.size)
print(df_train.shape[0])
df_train = df_train.append(row,ignore_index=True)
print(df_train.shape[0])
print(df_test.shape[0])
df_test.drop(row,inplace=True)

In [None]:
clf = MultinomialNB()
clf.fit(df_train.drop(['label','rationale'], axis =1),df_train['label'])
lwor_acc =clf.score(df_test.drop(['label','rationale'], axis =1),df_test['label'])
print("accuracy with Lw/oR is ", lwor_acc)

In [2]:
def demphsizer(df,i, rationale):
    demphsizees = []
    s = list(df.columns)
    s.pop()
    for col in s:
        if df.iloc[i][col]==1 and not col==rationale:
            demphsizees.append(col)
    return demphsizees

In [3]:
def load_labeled_data():
    df = pandas.read_csv('labeled_instances_300.csv',index_col=False)
    df = pandas.DataFrame.astype(df,dtype=float,errors='ignore')
    df['label'] = df['label'].astype('int')
    df.drop_duplicates(inplace=True)
    return df

In [4]:
def apply_rationale(df_train,stemmer):
    bar = progressbar.ProgressBar(maxval=100, \
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    for i in range(df_train.shape[0]):
        bar.update((i*100//df_train.shape[0]))
        word = stemmer.stem(df_train.iloc[i]['rationale'])
        for col_name in demphsizer(df_train,i, word):
            df_train.set_value(i,col_name, 0.01)
    bar.finish()

In [None]:
stemmer = EnglishStemmer()
apply_rationale(df_train,stemmer)
clf = MultinomialNB(alpha=0.5)
clf.fit(df_train.drop(['label','rationale'], axis =1),df_train['label'])
lwr_acc = clf.score(df_test.drop(['label','rationale'], axis =1),df_test['label'])
print("accuracy with LwR is ", lwr_acc)

In [5]:
def train_MutliNB(df_train,df_test):
    clf = MultinomialNB()
    clf.fit(df_train.drop(['label','rationale'], axis =1),df_train['label'])
    lwr_acc = clf.score(df_test.drop(['label','rationale'], axis =1),df_test['label'].astype('int'))
    print("current accuracy with LwR is ", lwr_acc)
    return clf

In [6]:
def calculate_rationale(df, i,stemmer):
    rationale = stemmer.stem(df.iloc[i]['rationale'])
    for col_name in demphsizer(df,i, rationale):
        df.set_value(i,col_name, 0.01)

In [7]:
def find_most_uncertain_instance(clf,df_train,df_test,data):
    predicted_prob = clf.predict_proba(df_test.drop(['label','rationale'], axis =1))
    clearPRE = np.abs(np.add(predicted_prob[:,1],-0.5))
    index = np.argmin(clearPRE)
    obj = data.iloc[index+df_train.shape[0]]['Text']
    print(obj)
    return index

In [12]:
def interactive_learning():
    data = pandas.read_csv('Chat_Text_Data.csv')
    data.drop_duplicates(inplace=True)
    print("loading original data...")
    df = load_labeled_data()
    print("loading labeled data...")
    df_train=df[0:50]
    df_test=df[50:300]
    print("applying rationale on training data...")
    stemmer = EnglishStemmer()
    #apply_rationale(df_train,stemmer)
    print("training a MultinomialNB classifier...")
    clf = train_MutliNB(df_train,df_test)
    budget = 5
    print("interactive learning started")
    print("if you want to quit at any time, enter label as \'exit0\'")
    while budget >0 :    
        i = find_most_uncertain_instance(clf,df_train,df_test,data)
        print("current budget is ", budget)
        rationale = input()
        if rationale == 'exit0':
            print("goodbye, I am going to die.")
            break
        label = input()
        df_test.set_value(i,'label',label)
        df_test.set_value(i,'rationale',rationale)
        calculate_rationale(df_test, i,stemmer)
        row = df_test.iloc[i]
        df_train = df_train.append(row,ignore_index=True)
        df_test.drop(i,axis=0,inplace=True)
        clf = train_MutliNB(df_train,df_test)
        budget-=1

In [13]:
interactive_learning()

loading original data...
loading labeled data...
applying rationale on training data...
training a MultinomialNB classifier...
current accuracy with LwR is  0.638554216867
interactive learning started
if you want to quit at any time, enter label as 'exit0'
stupid lakers....
current budget is  5
stupid
0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


current accuracy with LwR is  0.661290322581
well i'm gonna go enjoy being in seattle.....
current budget is  4
enjoy
1
current accuracy with LwR is  0.68016194332
Purdue Band sucks.
current budget is  3
sucks
0
current accuracy with LwR is  0.682926829268
boston is great: ).
current budget is  2
great
1
current accuracy with LwR is  0.69387755102
i love ucla!..
current budget is  1
love
1
current accuracy with LwR is  0.696721311475
