In [1]:
import pandas as pd

### Load Dataset

In [2]:
# Training phrases from the tab-separated file, indexed by PhraseId.
train = pd.read_csv(
    "train.tsv",
    sep="\t",
    index_col="PhraseId",
)

print(train.shape)
train.head(n=5)

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [3]:
# Test phrases from the tab-separated file, indexed by PhraseId.
test = pd.read_csv(
    "test.tsv",
    sep="\t",
    index_col="PhraseId",
)

print(test.shape)
test.head(n=5)

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,8545,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...
156063,8545,An
156064,8545,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine


## Preprocessing

In [107]:
# Keep an untouched copy of the raw text next to the column that gets cleaned.
# The guard makes the cell idempotent: re-running it after the cleaning step
# must not overwrite the backup with already-cleaned phrases.
if "Phrase(Origin)" not in train.columns:
    train["Phrase(Origin)"] = train["Phrase"].copy()

print(train.shape)
train[["Phrase", "Phrase(Origin)"]].head()

In [109]:
# Keep an untouched copy of the raw text next to the column that gets cleaned.
# The guard makes the cell idempotent: re-running it after the cleaning step
# must not overwrite the backup with already-cleaned phrases.
if "Phrase(Origin)" not in test.columns:
    test["Phrase(Origin)"] = test["Phrase"].copy()

print(test.shape)
test[["Phrase", "Phrase(Origin)"]].head()

### Clean Text

In [134]:
def clean_text(phrase):
    phrase = phrase.replace("ca n't", "can not")
    phrase = phrase.replace("does n't", "does not")
    phrase = phrase.replace("n't", "not")
    phrase = phrase.replace("-", " ")
    phrase = phrase.replace("ed ", " ")
    phrase = phrase.replace("ing ", " ")
    phrase = phrase.replace("s ", " ")

    return phrase

In [135]:
# Always clean from the untouched backup column. Cleaning "Phrase" in place is
# not idempotent: re-running the cell would apply the suffix stripping a second
# time to already-cleaned text.
train["Phrase"] = train["Phrase(Origin)"].apply(clean_text)

print(train.shape)
train[["Phrase", "Phrase(Origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [136]:
# Always clean from the untouched backup column. Cleaning "Phrase" in place is
# not idempotent: re-running the cell would apply the suffix stripping a second
# time to already-cleaned text.
test["Phrase"] = test["Phrase(Origin)"].apply(clean_text)

print(test.shape)
test[["Phrase", "Phrase(Origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Vectorize Phrases

In [137]:
from sklearn.feature_extraction.text import CountVectorizer

# Count unigrams and bigrams, with the vocabulary capped at 10000 features.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [138]:
# Learn the vocabulary from the cleaned training phrases only.
vectorizer.fit(raw_documents=train["Phrase"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [139]:
# Turn the cleaned training phrases into a document-term count matrix.
X_train = vectorizer.transform(raw_documents=train["Phrase"])
X_train

<156060x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 1152072 stored elements in Compressed Sparse Row format>

In [140]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement, get_feature_names_out(), when it exists and fall back
# to the old method on older releases. list() keeps `vocabulary` a plain list
# in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    vocabulary = list(vectorizer.get_feature_names_out())
else:
    vocabulary = vectorizer.get_feature_names()

# Densify only the first 100 rows for a quick look at the count matrix.
pd.DataFrame(X_train[0:100].toarray(), columns=vocabulary).head()

Unnamed: 0,000,10,10 minutes,10 year,100,101,11,12,12 year,13,...,your watch,yourself,youth,youthful,zany,zeal,zero,zhang,zombie,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [141]:
# Encode the test phrases with the vocabulary learned from the training set.
X_test = vectorizer.transform(raw_documents=test["Phrase"])
X_test

<66292x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 421280 stored elements in Compressed Sparse Row format>

In [142]:
# Target labels: the Sentiment column of the training frame.
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head(n=5)

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

### Train

In [143]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD; the seed is fixed so runs are repeatable.
model = SGDClassifier(
    random_state=37,
)
model

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

### Scoring

In [154]:
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for every training row using 5-fold cross-validation.
y_predict = cross_val_predict(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=5,
)

print(y_predict.shape)
y_predict

In [145]:
from sklearn.metrics import accuracy_score

# Accuracy of the cross-validated predictions against the true labels.
score = accuracy_score(y_true=y_train, y_pred=y_predict)

print("Score = {0:0.6f}".format(score))

Score = 0.576214


In [146]:
import numpy as np

# Error-analysis frame: training rows plus their cross-validated prediction.
result = train.copy()
result["Sentiment(predict)"] = y_predict

# Absolute gap between the true and predicted label.
result["Distance"] = np.abs(result["Sentiment"] - result["Sentiment(predict)"])

# Largest misses first.
result = result.sort_values(by="Distance", ascending=False)
result.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(Origin),Sentiment(predict),Distance
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
49182,2404,It would be disingenuous to call Reno a great ...,0,It would be disingenuous to call Reno a great ...,4,4
15921,683,has made literature literal without killing it...,4,has made literature literal without killing it...,0,4
15904,683,"The year 's greatest adventure , and Jackson '...",4,"The year 's greatest adventure , and Jackson '...",0,4
110715,5866,` In this poor remake of such a well loved cla...,0,` In this poor remake of such a well loved cla...,4,4
77871,4006,"Think The Lion King redone for horses , with f...",0,"Think The Lion King redone for horses , with f...",4,4


In [147]:
# result[result["Phrase"].str.contains("can't")]

In [148]:
# Save the first 10000 rows of the sorted frame (the largest misses) for inspection.
result.iloc[:10000].to_csv(path_or_buf="result.csv")

In [36]:
# Dump the learned vocabulary so it can be inspected outside the notebook.
pd.DataFrame(data=vocabulary).to_csv(path_or_buf="vocabulary.csv")

### Predict

In [149]:
# Fit the classifier on the full training matrix before predicting the test set.
model.fit(X=X_train, y=y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [150]:
# Predicted sentiment label for every test phrase.
predictions = model.predict(X=X_test)

print(predictions.shape)
predictions

(66292,)


array([3, 3, 2, ..., 2, 2, 2])

## Submit

In [151]:
# Sample submission file, used as the template for the output format.
submission = pd.read_csv(
    "sampleSubmission.csv",
    index_col="PhraseId",
)

print(submission.shape)
submission.head(n=5)

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,2
156062,2
156063,2
156064,2
156065,2


In [152]:
# `predictions` follows the row order of `test`, and the assignment below is
# positional. Check that the template lists the same PhraseIds in the same
# order, otherwise predictions would be silently paired with the wrong phrases.
assert submission.index.equals(test.index), \
    "sampleSubmission.csv and test.tsv list PhraseIds in a different order"

submission["Sentiment"] = predictions

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,3
156062,3
156063,2
156064,3
156065,3


In [153]:
# Write the submission file; the PhraseId index is saved as the first column.
submission.to_csv(path_or_buf="baseline-script.csv")