In [137]:
import pandas as pd
import numpy as np

In [138]:
df = pd.read_csv('data-challenge-winners/data/train_great.csv', sep=';')

In [139]:
X = df['message'].values
y = df['y'].values

In [140]:
for i in range(X.shape[0]):
    if(pd.isnull(X[i])):
        X[i] = "PLSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS"

In [141]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
import time

In [142]:
X_vec = CountVectorizer(binary=True, ngram_range=(1, 1)).fit_transform(X)
params = {'alpha' : [0.4,0.45,0.5,0.55,0.6,0.7,0.8,0.9,1.]}
grid = GridSearchCV(BernoulliNB(), params)
grid.fit(X_vec, y)
print("best alpha = {} ; best score = {}".format(grid.best_params_['alpha'], grid.best_score_))

best alpha = 0.6 ; best score = 0.7805209513023783


In [6]:
X_vec = TfidfVectorizer(binary=False, ngram_range=(1, 2)).fit_transform(X)
params = {'alpha' : [0.,0.1,0.14,0.15,0.16,0.2,0.25,0.3,0.4,0.5]}
grid = GridSearchCV(MultinomialNB(), params)
grid.fit(X_vec, y)
print("best alpha = {} ; best score = {}".format(grid.best_params_['alpha'], grid.best_score_))

best alpha = 0.15 ; best score = 0.7626274065685165


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
         X_vec, y, test_size=0.20,)
clf = MultinomialNB(alpha=grid.best_params_['alpha']).fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)[:,1]

T = [0.4,0.45,0.47,0.49,0.5,0.51,0.53,0.55,0.57,0.59,0.6]
for th in T:
    print("Threshold = {} ; Score = {}".format(th, sum((y_pred > th) == y_test)/y_test.shape[0]))

Threshold = 0.4 ; Score = 0.7146092865232163
Threshold = 0.45 ; Score = 0.7485843714609286
Threshold = 0.47 ; Score = 0.7497168742921857
Threshold = 0.49 ; Score = 0.7542468856172141
Threshold = 0.5 ; Score = 0.766704416761042
Threshold = 0.51 ; Score = 0.7746319365798414
Threshold = 0.53 ; Score = 0.7757644394110985
Threshold = 0.55 ; Score = 0.7734994337485843
Threshold = 0.57 ; Score = 0.7712344280860702
Threshold = 0.59 ; Score = 0.7655719139297849
Threshold = 0.6 ; Score = 0.7599093997734995


In [16]:
start = time.time()
X_vec = CountVectorizer(binary=True, ngram_range=(1, 2), stop_words='english').fit_transform(X)
params = {'C' : [0.1,0.5,1,2,3,4,5,7,8,9,10]}
grid = GridSearchCV(LogisticRegression(), params, cv = 5)
grid.fit(X_vec, y)
print("best C = {} ; best score = {}   in {} secondes".format(grid.best_params_['C'], grid.best_score_, time.time() - start))

best C = 4 ; best score = 0.8004530011325028   in 7.098598480224609 secondes


In [145]:
X_vec = CountVectorizer(binary=True, ngram_range=(1, 2)).fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
         X_vec, y, test_size=0.10,)
clf = LogisticRegression(C=5).fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)[:,1]

T = [0.4,0.45,0.47,0.49,0.5,0.51,0.53,0.55,0.57,0.59,0.6]
for th in T:
    print("Threshold = {} ; Score = {}".format(th, sum((y_pred > th) == y_test)/y_test.shape[0]))

Threshold = 0.4 ; Score = 0.8122171945701357
Threshold = 0.45 ; Score = 0.8212669683257918
Threshold = 0.47 ; Score = 0.8235294117647058
Threshold = 0.49 ; Score = 0.8190045248868778
Threshold = 0.5 ; Score = 0.8190045248868778
Threshold = 0.51 ; Score = 0.8190045248868778
Threshold = 0.53 ; Score = 0.8190045248868778
Threshold = 0.55 ; Score = 0.8167420814479638
Threshold = 0.57 ; Score = 0.8144796380090498
Threshold = 0.59 ; Score = 0.8144796380090498
Threshold = 0.6 ; Score = 0.8144796380090498


In [28]:
X_vec = TfidfVectorizer(binary=True, ngram_range=(1, 2), stop_words='english').fit_transform(X)
params = {'C' : [50,100,200,500,700,1000]}
grid = GridSearchCV(LogisticRegression(), params)
grid.fit(X_vec, y)
print("best C = {} ; best score = {}".format(grid.best_params_['C'], grid.best_score_))

best C = 500 ; best score = 0.7868629671574179


In [161]:
start = time.time()
X_vec = CountVectorizer(binary=True, ngram_range=(1, 2)).fit_transform(X)
params = {'C' : [27.]}
grid = GridSearchCV(LogisticRegression(fit_intercept=True, class_weight={1 : np.sum(y_train)/n, 0 : 1 - np.sum(y_train)/n}), params, cv = 5)
grid.fit(X_vec, y)
print("best C = {} ; best score = {}   in {} secondes".format(grid.best_params_['C'], grid.best_score_, time.time() - start))

best C = 30 ; best score = 0.7970554926387315   in 2.9330592155456543 secondes


In [None]:
# best C = 27.0 ; best score = 0.8217440543601359   in 5.1147871017456055 secondes

In [116]:
# Avec intercept
df = pd.read_csv('data-challenge-winners/data/train_better.csv', sep=';')
X = df['message'].values
y = df['y'].values

X_vec = CountVectorizer(binary=True, ngram_range=(1, 1)).fit_transform(X)
n_samples, n_features = X_vec.shape
n = (n_samples*9)//10
X_train, y_train, X_valid, y_valid = X_vec[:n,:], y[:n], X_vec[n:,:], y[n:]
clf = LogisticRegression(tol=1e-7,C=6.,fit_intercept=True, class_weight={1 : np.sum(y_train)/n, 0 : 1 - np.sum(y_train)/n}).fit(X_train, y_train)
y_pred = clf.predict(X_valid)
print(np.sum(y_pred == y_valid)/(n_samples - n))

0.809954751131


In [37]:
y_test = (X_valid.dot(clf.coef_.T) + clf.intercept_) > 0.

In [38]:
np.sum(y_test != y_pred.reshape(n_samples - n,1))/(n_samples - n)

0.0

In [39]:
clf.intercept_

array([-1.52594777])

In [40]:
clf.coef_

array([[ -3.76985496e-03,   3.50238824e-03,  -3.76985496e-03, ...,
          7.53690493e-03,   4.38145461e-05,   6.97910047e-03]])

In [66]:
def val(X, Y, w, w0, C):
    n = X.shape[0]
    for i in range(n):
        if(Y[i] == 0.):
            Y[i] = -1.
    Y = Y.reshape(n,1)
    exp_vect = np.exp(-Y*(X.dot(w) + w0)).reshape(n,1)
    return C*np.sum(np.log(1. + exp_vect))/n + (np.sum(w**2) + w0**2)/2

def grad(X, Y, w, w0,C):
    n,p = X.shape
    for i in range(n):
        if(Y[i] == 0.):
            Y[i] = -1.
    Y = Y.reshape(n,1)
    exp_vect = np.exp(-Y*(X.dot(w) + w0)).reshape(n,1)
    coefs = (-Y*exp_vect/(1. + exp_vect))
    return [C*np.sum(coefs) + w0, C*((X.T).dot(coefs)).reshape(p,1)/n + w]

In [68]:
n,p = X_train.shape
w = np.zeros((p,1))
w0 = 0.
C = 6.
cur_grad = grad(X_train, y_train, w, w0, C)
norm_grad = np.sqrt(cur_grad[0]**2 + np.sum(cur_grad[1]**2))
eps = 7e-1
alpha = 0.0002
cmp = 0
while(norm_grad > eps):
    cmp += 1
    w0 -= alpha*cur_grad[0]
    w -= alpha*cur_grad[1]
    cur_grad = grad(X_train, y_train, w, w0, C)
    norm_grad = np.sqrt(cur_grad[0]**2 + np.sum(cur_grad[1]**2))
print(cmp)

[4575.0, array([[ 0.00151019],
       [ 0.0007551 ],
       [ 0.00151019],
       ..., 
       [-0.0007551 ],
       [ 0.0007551 ],
       [-0.0007551 ]])]
3


In [69]:
print(np.sum(((1./(1. + np.exp(-(X_valid.dot(w) + w0)))) > 0.5) == y_valid.reshape(y_valid.shape[0],1))/y_valid.shape[0])

0.717194570136


In [106]:
# Sans intercept
df = pd.read_csv('data-challenge-winners/data/train_better.csv', sep=';')
X = df['message'].values
y = df['y'].values

X_vec = CountVectorizer(binary=True, ngram_range=(1, 1)).fit_transform(X)
n_samples, n_features = X_vec.shape
n = (n_samples*9)//10
X_train, y_train, X_valid, y_valid = X_vec[:n,:], y[:n], X_vec[n:,:], y[n:]
clf = LogisticRegression(tol=5e-2,C=6.0,fit_intercept=False, class_weight={1 : np.sum(y_train)/n, 0 : 1 - np.sum(y_train)/n}).fit(X_train, y_train)
y_pred = clf.predict(X_valid)
print(np.sum(y_pred == y_valid)/(n_samples - n))

0.773755656109


In [96]:
def val2(X, Y, w, C):
    n = X.shape[0]
    for i in range(n):
        if(Y[i] == 0.):
            Y[i] = -1.
    Y = Y.reshape(n,1)
    exp_vect = np.exp(-Y*(X.dot(w))).reshape(n,1)
    return C*np.sum(np.log(1. + exp_vect))/n + np.sum(w**2)/2.

def grad2(X, Y, w, C):
    n,p = X.shape
    for i in range(n):
        if(Y[i] == 0.):
            Y[i] = -1.
    Y = Y.reshape(n,1)
    exp_vect = np.exp(-Y*(X.dot(w))).reshape(n,1)
    coefs = (-Y*exp_vect/(1. + exp_vect))
    return C*((X.T).dot(coefs)).reshape(p,1)/n + w

#print(val2(X_train, y_train, clf.coef_.reshape(clf.coef_.shape[1],1), 6.))
#print(val2(X_train, y_train, np.zeros((X_train.shape[1],1)), 6.))

print(np.linalg.norm(grad2(X_train, y_train, clf.coef_.reshape(clf.coef_.shape[1],1), 6.)))
print(np.linalg.norm(clf.coef_))
# 33631.6955089 31047.5800603 avec plus de tolérance...
# 16523.2424902

31.8753208025
31.9045852582


In [101]:
n,p = X_train.shape
w = np.zeros((p,1))
C = 6.
cur_grad = grad2(X_train, y_train, w, C)
norm_grad = np.linalg.norm(cur_grad)

eps = 7e-1
alpha = 0.002
cmp = 0
print(cur_grad[:10])
while(norm_grad > eps):
    cmp += 1
    w -= alpha*cur_grad
    cur_grad = grad2(X_train, y_train, w, C)
    norm_grad = np.linalg.norm(cur_grad)
print(cmp)

[[ 0.00151019]
 [ 0.0007551 ]
 [ 0.00151019]
 [ 0.00226529]
 [ 0.        ]
 [ 0.        ]
 [ 0.0007551 ]
 [ 0.        ]
 [ 0.0007551 ]
 [ 0.0007551 ]]
45


In [82]:
print(np.linalg.norm(grad2(X_train, y_train, w, 6.)))
print(np.linalg.norm(w))

0.699968029232
0.0677528819519


In [83]:
print(np.sum(((1./(1. + np.exp(-(X_valid.dot(w))))) > 0.5) == y_valid.reshape(y_valid.shape[0],1))/y_valid.shape[0])

0.762443438914
