## Load and Preprocess data

In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from unidecode import unidecode
from bs4 import BeautifulSoup
import scipy
from scipy import sparse

# Read both CSV files; missing cells are replaced with empty strings.
train = pd.read_csv('data/train.csv').fillna("")
test = pd.read_csv('data/test.csv').fillna("")

# Keep the test ids (needed for the submission file), then discard the id
# column from both frames.
idx = test['id'].values.astype(int)
train, test = train.drop('id', axis=1), test.drop('id', axis=1)

In [None]:
# The target is the median relevance rating.  Both relevance columns are then
# removed from the training frame.
y = train['median_relevance'].values
train = train.drop(['median_relevance', 'relevance_variance'], axis=1)

In [None]:
def remove_stuff(text_in):
    """
    Strip markup and punctuation from a text field and lower-case it.

    The text is parsed with BeautifulSoup and reduced to its plain text,
    passed through ``unidecode``, and then every hyphen, newline, comma, tab,
    slash, colon, period, parenthesis, double quote and single quote is
    replaced by one space.

    Parenthesised fragments such as "(tm)" keep their contents: only the
    parentheses themselves are turned into spaces.

    :param text_in: raw text, possibly containing HTML markup.
    :return: the cleaned, lower-cased text.
    """
    # NOTE(review): no parser is passed to BeautifulSoup, so bs4 picks
    # whichever one is installed; pin one explicitly if results must be
    # reproducible across machines.
    text_out = unidecode(BeautifulSoup(text_in).get_text())
    # One pass over a character class replaces the previous chain of
    # single-character substitutions.  A raw string keeps the pattern free of
    # invalid escape sequences.
    text_out = re.sub(r"[-\n,\t/:.()\"']", ' ', text_out)
    return text_out.lower()

In [None]:
# Clean the free-text product columns of both data sets with remove_stuff.
for frame in (train, test):
    for column in ('product_title', 'product_description'):
        frame[column] = frame[column].apply(remove_stuff)

In [None]:
from nltk import stem
porter = stem.PorterStemmer() # porter stemmer

def stem_string(text_in):
    """
    Stem every space-separated token of ``text_in`` with the Porter stemmer.

    Tokens are stripped of surrounding whitespace before stemming and joined
    back with single spaces.  Any parenthesised fragment, e.g. "(tm)" or
    "(r)", is then removed from the joined string.

    :param text_in: text to stem.
    :return: the stemmed text.
    """
    stemmed_tokens = []
    for token in text_in.split(' '):
        stemmed_tokens.append(porter.stem(token.strip()))
    joined = ' '.join(stemmed_tokens)
    return re.sub(r'\([^)]*\)', '', joined)

In [None]:
# Add a stemmed copy ("<column>_stem") of each text column to the training frame.
for column in ('query', 'product_title', 'product_description'):
    train[column + '_stem'] = train[column].apply(stem_string)

In [None]:
# Add a stemmed copy ("<column>_stem") of each text column to the test frame.
for column in ('query', 'product_title', 'product_description'):
    test[column + '_stem'] = test[column].apply(stem_string)

## Prepare Matrix

In [None]:
# Build one document per row by joining the stemmed query, title and
# description with single spaces.
def _join_text_fields(row):
    """Return the three stemmed text fields of ``row`` as one string."""
    return '%s %s %s' % (row['query_stem'],
                         row['product_title_stem'],
                         row['product_description_stem'])


traindata = list(train.apply(_join_text_fields, axis=1))
testdata = list(test.apply(_join_text_fields, axis=1))

In [None]:
# TF-IDF over word uni- and bi-grams with English stop words removed.  Terms
# must appear in at least 3 documents and the vocabulary is capped at 20000
# features.  The boolean switches are passed as real booleans rather than 1,
# which is what the scikit-learn API documents for them.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=20000,
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)  # token_pattern=r'\w{1,}',

In [None]:
# Fit the vectorizer on the training documents only.
tfv.fit(traindata)

In [None]:
# Vectorise the training and test documents with the fitted vectorizer.
X, X_test = (tfv.transform(documents) for documents in (traindata, testdata))

## Add feature column to our prepared matrix

This is how we got first rank :) Basically, if every search term occurs in the product title (or description), I append 1; otherwise I append 0. This is not ideal, since a product that matches the query only partially can still be given a high score by raters.

In [None]:
# Title flag (train): 1 when every space-separated token of the stemmed query
# is a substring of the stemmed product title, otherwise 0.
occurence_train = [
    int(all(term in title_stem for term in query_stem.split(' ')))
    for query_stem, title_stem in zip(train['query_stem'],
                                      train['product_title_stem'])
]

In [None]:
# Description flag (train): 1 when every space-separated token of the stemmed
# query is a substring of the stemmed product description, otherwise 0.
# Rows with an empty description fall back to the title flag for the same row.
# The fallback must look at this row's entry of occurence_train; comparing
# the whole list to 1 is always False and silently disabled it.
occurence_des_train = []
for query_stem, description_stem, title_flag in zip(
        train['query_stem'], train['product_description_stem'],
        occurence_train):
    all_terms_found = all(
        term in description_stem for term in query_stem.split(' '))
    title_fallback = description_stem == '' and title_flag == 1
    occurence_des_train.append(1 if all_terms_found or title_fallback else 0)

In [None]:
# Title flag (test): 1 when every space-separated token of the stemmed query
# is a substring of the stemmed product title, otherwise 0.
occurence_test = [
    int(all(term in title_stem for term in query_stem.split(' ')))
    for query_stem, title_stem in zip(test['query_stem'],
                                      test['product_title_stem'])
]

In [None]:
# Description flag (test): 1 when every space-separated token of the stemmed
# query is a substring of the stemmed product description, otherwise 0.
# Rows with an empty description fall back to the title flag for the same row.
# The fallback must look at this row's entry of occurence_test; comparing the
# whole list to 1 is always False and silently disabled it.
occurence_des_test = []
for query_stem, description_stem, title_flag in zip(
        test['query_stem'], test['product_description_stem'],
        occurence_test):
    all_terms_found = all(
        term in description_stem for term in query_stem.split(' '))
    title_fallback = description_stem == '' and title_flag == 1
    occurence_des_test.append(1 if all_terms_found or title_fallback else 0)

## Word length ratio

In [None]:
# Ratio feature (train): fraction of the space-separated stemmed query tokens
# that occur as a substring of the stemmed product title.
occurence_ratio_train = []
for query_stem, title_stem in zip(train['query_stem'],
                                  train['product_title_stem']):
    terms = query_stem.split(' ')
    hits = sum(1 for term in terms if term.strip() in title_stem)
    occurence_ratio_train.append(hits / float(len(terms)))

In [None]:
# Ratio feature (test): fraction of the space-separated stemmed query tokens
# that occur as a substring of the stemmed product title.
occurence_ratio_test = []
for query_stem, title_stem in zip(test['query_stem'],
                                  test['product_title_stem']):
    terms = query_stem.split(' ')
    hits = sum(1 for term in terms if term.strip() in title_stem)
    occurence_ratio_test.append(hits / float(len(terms)))

In [None]:
# X can be either from text with stemmer or without stemmer.
# Append the three hand-made features as extra columns, in this order:
# title match ratio, description flag, title flag.  The stacking is done with
# scipy.sparse so the TF-IDF matrix never has to be expanded to a dense array.
extra_train_columns = [
    sparse.csr_matrix(np.asarray(values, dtype=float).reshape(-1, 1))
    for values in (occurence_ratio_train, occurence_des_train, occurence_train)
]
X_occur = sparse.hstack([X] + extra_train_columns, format='csr')

In [None]:
# X_test can be either from text with stemmer or without stemmer.
# Append the three hand-made features as extra columns, in this order:
# title match ratio, description flag, title flag.  The stacking is done with
# scipy.sparse so the TF-IDF matrix never has to be expanded to a dense array.
extra_test_columns = [
    sparse.csr_matrix(np.asarray(values, dtype=float).reshape(-1, 1))
    for values in (occurence_ratio_test, occurence_des_test, occurence_test)
]
X_test_occur = sparse.hstack([X_test] + extra_test_columns, format='csr')

In [None]:
# L2-regularised logistic regression (dual formulation) with C=5.0.
logreg_settings = dict(
    penalty='l2', dual=True, tol=0.0001,
    C=5.0, fit_intercept=True, intercept_scaling=1.0,
    class_weight='auto', random_state=42,
)
model = LogisticRegression(**logreg_settings)

# Train on the full training matrix, then predict the test rows.
preds = model.fit(X_occur, y).predict(X_test_occur)

# Write the predictions next to their test ids.
submission = pd.DataFrame({"id": idx, "prediction": preds})
submission.to_csv("tf_idf_stem_occur_logistic_tuned.csv", index=False)

## How I chose C=5.0 for logistic regression

In [None]:
%matplotlib inline
from sklearn import cross_validation
import matplotlib.pyplot as plt

In [None]:
#C_list = [0.1, 0.3, 1, 3, 10, 100, 1000]
C_list = [0.1, 0.3, 1, 3, 5, 6, 7, 10]

# For each candidate C, record the mean 5-fold cross-validation score of the
# same logistic regression configuration used for the submission.
results = []
for C in C_list:
    # Function-call form works on Python 2 and 3 alike, and matches the
    # print(...) calls used in the rest of the file.
    print(C)
    model = LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                               C=C, fit_intercept=True, intercept_scaling=1.0,
                               class_weight='auto', random_state=42)
    scores = cross_validation.cross_val_score(model, X_occur, y, cv=5)
    results.append([C, np.mean(scores)])

In [None]:
# Scatter the mean cross-validation score (y axis) against C (x axis).
results = np.array(results)
c_values, mean_scores = results[:, 0], results[:, 1]
plt.scatter(c_values, mean_scores)
plt.show()

In [None]:
# Sum of row 1 over every column except the last three (the appended
# hand-made features).  Row and column are indexed in one step: chaining
# [1][:-3] slices the rows of a one-row matrix and therefore sums nothing.
X_occur[1, :-3].sum()

In [None]:
# Histogram of the logistic regression predictions on the test set.
plt.hist(preds)
plt.show()

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
# Linear classifier trained by SGD with hinge loss and an L2 penalty.
sgd_settings = dict(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5,
                    random_state=42)
clf_2 = SGDClassifier(**sgd_settings)

# Train on the full training matrix, then predict the test rows.
clf_2.fit(X_occur, y)
preds_svm = clf_2.predict(X_test_occur)

In [None]:
# Histogram of the SGD classifier predictions on the test set.
plt.hist(preds_svm)
plt.show()

In [None]:
# Number of test rows on which the two classifiers disagree.
disagreements = preds != preds_svm
print(np.sum(disagreements))

In [None]:
# Sum of the column at index 4 of the training feature matrix.
np.sum(X_occur[:,4])

In [None]:
from __future__ import print_function

from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

# Split the dataset in two equal parts: one to tune on, one to evaluate on.
# The halves get their own names so that the test-set matrix ``X_test`` built
# earlier in the file is not overwritten by this experiment.
X_dev, X_eval, y_dev, y_eval = train_test_split(
    X_occur, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'penalty': ['l2'], 'dual': [True, False],
                     'class_weight':['auto', None],
                     'C': [0.1, 0.5, 5.5, 5., 6., 7.]},
                    {'penalty': ['l1', 'l2'], 'dual': [False],
                     'class_weight':['auto', None],
                     'C': [0.1, 0.5, 5.5, 5., 6., 7.]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # 5-fold grid search on the development half, optimising the weighted
    # variant of the current metric.
    clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)
    clf.fit(X_dev, y_dev)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # The per-fold scores use their own name so the ``scores`` list that the
    # outer loop iterates over is not rebound.
    for params, mean_score, fold_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, fold_scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_eval, clf.predict(X_eval)
    print(classification_report(y_true, y_pred))
    print()

In [None]:
from __future__ import print_function

from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import BaggingClassifier

# Split the dataset in two equal parts: one to tune on, one to evaluate on.
# The halves get their own names so that the test-set matrix ``X_test`` built
# earlier in the file is not overwritten by this experiment.
X_dev, X_eval, y_dev, y_eval = train_test_split(
    X_occur, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'n_estimators': [10],
                     'max_samples': [0.3, 0.5, 0.8, 1.0],
                     'max_features': [0.3, 0.5, 0.8, 1.0]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # 5-fold grid search on the development half, optimising the weighted
    # variant of the current metric.
    clf = GridSearchCV(BaggingClassifier(n_jobs=8), tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)
    clf.fit(X_dev, y_dev)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # The per-fold scores use their own name so the ``scores`` list that the
    # outer loop iterates over is not rebound.
    for params, mean_score, fold_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, fold_scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_eval, clf.predict(X_eval)
    print(classification_report(y_true, y_pred))
    print()

In [None]:
from __future__ import print_function

from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Split the dataset in two equal parts: one to tune on, one to evaluate on.
# The halves get their own names so that the test-set matrix ``X_test`` built
# earlier in the file is not overwritten by this experiment.
X_dev, X_eval, y_dev, y_eval = train_test_split(
    X_occur, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'n_estimators': [5, 10, 20],
                     'class_weight': ["auto", "subsample", None]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # 5-fold grid search on the development half, optimising the weighted
    # variant of the current metric.
    clf = GridSearchCV(RandomForestClassifier(n_jobs=8), tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)
    clf.fit(X_dev, y_dev)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # The per-fold scores use their own name so the ``scores`` list that the
    # outer loop iterates over is not rebound.
    for params, mean_score, fold_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, fold_scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_eval, clf.predict(X_eval)
    print(classification_report(y_true, y_pred))
    print()