In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
#from sklearn import cross_validation
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.feature_extraction import stop_words
from sklearn.metrics import f1_score
from nltk.stem.snowball import SnowballStemmer
from sklearn.utils import resample
from sklearn.metrics import precision_recall_curve
import string, re

%matplotlib inline

In [2]:
def run_model(train_data, test_data, vectorizer_type, binary_type, ngram, stop_word, model_type, max_t):
    """Vectorize the question text, fit a classifier and score the test set.

    Parameters
    ----------
    train_data : mapping/DataFrame with 'question_text' and 'target' entries
        Text used to fit the vectorizer and labels used to fit the model.
    test_data : mapping/DataFrame with a 'question_text' entry
        Text to score.
    vectorizer_type : callable
        Vectorizer class; it is instantiated with the keyword arguments
        ``binary``, ``stop_words`` and ``ngram_range`` and must provide
        ``fit_transform`` and ``transform``.
    binary_type, ngram, stop_word
        Forwarded to the vectorizer as ``binary``, ``ngram_range`` and
        ``stop_words`` respectively.
    model_type : estimator instance
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is fitted
        in place (the caller's instance is modified).
    max_t : float
        Decision threshold. A row is labelled 1 only when its score is
        strictly greater than ``max_t``; scores equal to ``max_t`` get 0.

    Returns
    -------
    tuple
        ``(predictions, targets)`` where ``predictions`` is column 1 of the
        ``predict_proba`` output for the test rows and ``targets`` is a plain
        list of 0/1 ints obtained by thresholding ``predictions``.
    """
    X_train = train_data['question_text']
    Y_train = train_data['target']
    X_test = test_data['question_text']

    vectorizer = vectorizer_type(binary=binary_type, stop_words=stop_word, ngram_range=ngram)
    # Learn the vocabulary and encode the training text in a single pass
    # instead of fit() followed by a second full pass in transform().
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    model = model_type
    model.fit(X_train_vectorized, Y_train)

    predictions = model.predict_proba(X_test_vectorized)[:, 1]
    # Vectorized threshold; "<=" keeps the original boundary rule, and
    # tolist() preserves the original return type (list of Python ints).
    targets = np.where(predictions <= max_t, 0, 1).tolist()

    return (predictions, targets)

In [3]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


# Replacement table consumed by replace_typical_misspell(): every occurrence of
# a key in the (lower-cased) text is swapped for its value.
mispell_dict = {
    # British spellings -> American spellings
    'colour': 'color',
    'centre': 'center',
    'favourite': 'favorite',
    'neighbour': 'neighbor',
    'humour': 'humor',
    'apologise': 'apologize',
    'travelling': 'traveling',
    'counselling': 'counseling',
    'recognise': 'recognize',
    'theatre': 'theater',
    'cancelled': 'canceled',
    'travelled': 'traveled',
    'offence': 'offense',
    'licence': 'license',
    'labour': 'labor',
    'behaviour': 'behavior',
    'organisation': 'organization',
    'criticise': 'criticize',
    # NOTE(review): 'citicise' looks like a typo of 'criticise' (added above);
    # it is kept so text that matched before still matches.
    'citicise': 'criticize',

    # Contractions written without an apostrophe
    # ('wouldnt' used to be listed twice; the duplicate key was dropped).
    'didnt': 'did not',
    'doesnt': 'does not',
    'wouldnt': 'would not',
    'isnt': 'is not',
    'shouldnt': 'should not',

    # Abbreviations
    'wwii': 'world war 2',

    # Individual platforms collapsed into one shared token
    'instagram': 'social_media',
    'whatsapp': 'social_media',
    'snapchat': 'social_media',
    'facebook': 'social_media',
}
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Return ``text`` with each match of ``mispellings_re`` replaced.

    The replacement for a match is the ``mispellings`` entry stored under the
    exact matched text.
    """
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)


# Contraction expansions, applied in order on lower-cased text while the
# apostrophes are still present. The trailing \b keeps an opening quote such
# as "'dog" from being read as the contraction "'d".
_CONTRACTION_RULES = (
    (r"what's\b", "what is "),
    (r"can't\b", "can not "),    # otherwise n't would leave the stub "ca"
    (r"won't\b", "will not "),   # otherwise n't would leave the stub "wo"
    (r"'s\b", " "),
    (r"'ve\b", " have "),
    (r"n't\b", " not "),
    (r"'m\b", " am "),
    (r"'re\b", " are "),
    (r"'d\b", " would "),
    (r"'ll\b", " will "),
)

# Translation table that deletes every character of string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stemmer():
    """Return one shared English SnowballStemmer, created on first use."""
    stemmer = getattr(_english_stemmer, '_instance', None)
    if stemmer is None:
        stemmer = _english_stemmer._instance = SnowballStemmer('english')
    return stemmer


def clean(text):
    """Normalise one question for vectorization.

    Steps, in order: lower-case, expand contractions, remove ASCII
    punctuation, mask digit runs with '#' (one per digit, at most five),
    apply the misspelling table, stem every word and join the words with
    single spaces.

    Contractions are expanded *before* punctuation is removed; once the
    apostrophes are gone the contraction patterns can no longer match.

    Not idempotent: '#' is itself punctuation, so running ``clean`` on its own
    output removes the digit placeholders.

    Parameters
    ----------
    text : str
        Raw question text. ``None`` and NaN are treated as empty text.

    Returns
    -------
    str
        The cleaned, stemmed text.
    """
    if not isinstance(text, str):
        # Missing values (None / NaN) have no text to clean.
        if text is None or text != text:
            return ''
        text = str(text)

    text = text.lower()

    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    # Remove punctuation (must happen before digit masking, which adds '#')
    text = text.translate(_PUNCTUATION_TABLE)

    # 1-4 digits -> that many '#'; 5 or more digits -> '#####'
    text = re.sub(r'[0-9]+', lambda run: '#' * min(len(run.group(0)), 5), text)

    # Replace typical misspells
    text = replace_typical_misspell(text)

    # Stem words; split() without arguments also collapses the repeated
    # spaces left behind by the replacements above.
    stemmer = _english_stemmer()
    return ' '.join(stemmer.stem(word) for word in text.split())

In [None]:
# Read both CSV files (their first column becomes the index), then run every
# question through clean().
train_data = pd.read_csv('../input/train.csv', index_col=0)
test_data = pd.read_csv('../input/test.csv', index_col=0)

train_data['question_text'] = train_data['question_text'].apply(clean)
test_data['question_text'] = test_data['question_text'].apply(clean)

In [None]:
# Probability cut-off passed to run_model: scores above it are labelled 1.
# NOTE(review): how this value was chosen is not shown in this notebook.
BEST_THRESHOLD = .1636935

# train_data / test_data already went through clean() when they were loaded.
# clean() must not be applied a second time: it strips punctuation, which
# would delete the '#' digit placeholders, and it would stem the already
# stemmed words again. Only copy the frames here.
train_bestmodel = train_data.copy()
test_bestmodel = test_data.copy()

# Binary 1- to 4-gram counts, no stop-word list, default LogisticRegression.
# `test` holds the (probabilities, 0/1 labels) pair returned by run_model.
test = run_model(train_bestmodel, test_bestmodel, CountVectorizer, True, (1, 4), None,
                 LogisticRegression(), BEST_THRESHOLD)

In [None]:
# One row per test question: its id (the index of test_data), the score in
# test[0] and the thresholded label in test[1]. The column order is fixed
# explicitly.
test_results = pd.DataFrame(
    {
        'qid': test_data.index,
        'prediction_percent': test[0],
        'prediction': test[1],
    },
    columns=['qid', 'prediction_percent', 'prediction'],
)

In [None]:
# Write the submission file without the raw score column.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# newer pandas versions no longer accept.
results = test_results.drop(columns='prediction_percent')
results.to_csv('submission.csv', index=False)