In [1]:
import pandas as pd

### Load Dataset

In [2]:
train = pd.read_csv("input/train.tsv", sep="\t", index_col="PhraseId")

print(train.shape)
train.head()

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [43]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from tqdm import tqdm

def pos_phrase(phrase):
    words = word_tokenize(phrase)
    word_pos = [word + "_" + pos for word, pos in pos_tag(words)]

    return " ".join(word_pos)

tqdm.pandas(desc="pos tagging...")
train["Phrase"].progress_apply(pos_phrase)

In [60]:
def find_wordnet_pos(pos):
    if "V" in pos:
        return "v"
    else:
        return "n"
    
print(find_wordnet_pos("VB"))
print(find_wordnet_pos("NN"))
print(find_wordnet_pos("RB"))

v
n
n


In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize_phrase(phrase):
    words = word_tokenize(phrase)
    words_pos = pos_tag(words)

    lemmatized_words = []

    for word, pos in words_pos:
        lemmatized_word = lemmatizer.lemmatize(word, pos=find_wordnet_pos(pos))
        lemmatized_words.append(lemmatized_word)
        
    return " ".join(lemmatized_words)

tqdm.pandas(desc="pos tagging + lemmatizing...")
train["Phrase"].progress_apply(lemmatize_phrase)


pos tagging + lemmatizing...:   0%|          | 0/156060 [00:00<?, ?it/s][A
pos tagging + lemmatizing...:   0%|          | 138/156060 [00:00<01:53, 1379.14it/s][A
pos tagging + lemmatizing...:   0%|          | 292/156060 [00:00<01:49, 1423.63it/s][A
pos tagging + lemmatizing...:   0%|          | 420/156060 [00:00<01:53, 1376.43it/s][A
pos tagging + lemmatizing...:   0%|          | 567/156060 [00:00<01:51, 1400.30it/s][A
pos tagging + lemmatizing...:   0%|          | 695/156060 [00:00<01:54, 1360.11it/s][A
pos tagging + lemmatizing...:   1%|          | 809/156060 [00:00<02:06, 1229.17it/s][A
pos tagging + lemmatizing...:   1%|          | 920/156060 [00:00<02:10, 1189.54it/s][A
pos tagging + lemmatizing...:   1%|          | 1035/156060 [00:00<02:11, 1175.59it/s][A
pos tagging + lemmatizing...:   1%|          | 1182/156060 [00:00<02:03, 1249.45it/s][A
pos tagging + lemmatizing...:   1%|          | 1304/156060 [00:01<02:12, 1167.49it/s][A
pos tagging + lemmatizing...:   1%|     

pos tagging + lemmatizing...:  13%|█▎        | 19836/156060 [00:19<02:17, 988.91it/s][A
pos tagging + lemmatizing...:  13%|█▎        | 19940/156060 [00:20<02:15, 1002.32it/s][A
pos tagging + lemmatizing...:  13%|█▎        | 20042/156060 [00:20<02:25, 932.87it/s] [A
pos tagging + lemmatizing...:  13%|█▎        | 20143/156060 [00:20<02:22, 952.10it/s][A
pos tagging + lemmatizing...:  13%|█▎        | 20240/156060 [00:20<02:22, 954.40it/s][A
pos tagging + lemmatizing...:  13%|█▎        | 20345/156060 [00:20<02:18, 980.62it/s][A
pos tagging + lemmatizing...:  13%|█▎        | 20461/156060 [00:20<02:11, 1028.16it/s][A
pos tagging + lemmatizing...:  13%|█▎        | 20571/156060 [00:20<02:09, 1048.14it/s][A
pos tagging + lemmatizing...:  13%|█▎        | 20677/156060 [00:20<02:13, 1017.78it/s][A
pos tagging + lemmatizing...:  13%|█▎        | 20780/156060 [00:20<02:32, 886.36it/s] [A
pos tagging + lemmatizing...:  13%|█▎        | 20876/156060 [00:21<02:29, 903.92it/s][A
pos tagging + l

pos tagging + lemmatizing...:  25%|██▌       | 39673/156060 [00:39<01:58, 983.95it/s][A
pos tagging + lemmatizing...:  25%|██▌       | 39773/156060 [00:39<01:59, 975.41it/s][A
pos tagging + lemmatizing...:  26%|██▌       | 39878/156060 [00:39<01:56, 995.38it/s][A
pos tagging + lemmatizing...:  26%|██▌       | 39979/156060 [00:39<01:56, 996.01it/s][A
pos tagging + lemmatizing...:  26%|██▌       | 40080/156060 [00:39<01:56, 997.25it/s][A
pos tagging + lemmatizing...:  26%|██▌       | 40181/156060 [00:40<02:03, 940.58it/s][A
pos tagging + lemmatizing...:  26%|██▌       | 40276/156060 [00:40<02:05, 925.47it/s][A
pos tagging + lemmatizing...:  26%|██▌       | 40372/156060 [00:40<02:04, 929.33it/s][A
pos tagging + lemmatizing...:  26%|██▌       | 40466/156060 [00:40<02:04, 931.57it/s][A
pos tagging + lemmatizing...:  26%|██▌       | 40570/156060 [00:40<02:00, 961.37it/s][A
pos tagging + lemmatizing...:  26%|██▌       | 40676/156060 [00:40<01:56, 987.59it/s][A
pos tagging + lemmati

pos tagging + lemmatizing...:  39%|███▊      | 60170/156060 [00:58<01:27, 1091.33it/s][A
pos tagging + lemmatizing...:  39%|███▊      | 60293/156060 [00:59<01:24, 1129.51it/s][A
pos tagging + lemmatizing...:  39%|███▊      | 60407/156060 [00:59<01:25, 1118.57it/s][A
pos tagging + lemmatizing...:  39%|███▉      | 60520/156060 [00:59<01:28, 1080.63it/s][A
pos tagging + lemmatizing...:  39%|███▉      | 60629/156060 [00:59<01:31, 1047.57it/s][A
pos tagging + lemmatizing...:  39%|███▉      | 60735/156060 [00:59<01:32, 1034.43it/s][A
pos tagging + lemmatizing...:  39%|███▉      | 60839/156060 [00:59<01:34, 1002.78it/s][A
pos tagging + lemmatizing...:  39%|███▉      | 60947/156060 [00:59<01:32, 1023.71it/s][A
pos tagging + lemmatizing...:  39%|███▉      | 61062/156060 [00:59<01:29, 1058.29it/s][A
pos tagging + lemmatizing...:  39%|███▉      | 61176/156060 [00:59<01:27, 1080.45it/s][A
pos tagging + lemmatizing...:  39%|███▉      | 61288/156060 [01:00<01:26, 1090.88it/s][A
pos taggin

pos tagging + lemmatizing...:  52%|█████▏    | 80459/156060 [01:18<01:12, 1046.63it/s][A
pos tagging + lemmatizing...:  52%|█████▏    | 80565/156060 [01:18<01:16, 981.24it/s] [A
pos tagging + lemmatizing...:  52%|█████▏    | 80665/156060 [01:18<01:16, 982.68it/s][A
pos tagging + lemmatizing...:  52%|█████▏    | 80765/156060 [01:18<01:17, 970.09it/s][A
pos tagging + lemmatizing...:  52%|█████▏    | 80890/156060 [01:18<01:12, 1034.59it/s][A
pos tagging + lemmatizing...:  52%|█████▏    | 80996/156060 [01:18<01:15, 998.24it/s] [A
pos tagging + lemmatizing...:  52%|█████▏    | 81099/156060 [01:18<01:14, 1007.52it/s][A
pos tagging + lemmatizing...:  52%|█████▏    | 81212/156060 [01:18<01:11, 1040.69it/s][A
pos tagging + lemmatizing...:  52%|█████▏    | 81321/156060 [01:19<01:11, 1050.90it/s][A
pos tagging + lemmatizing...:  52%|█████▏    | 81430/156060 [01:19<01:10, 1062.21it/s][A
pos tagging + lemmatizing...:  52%|█████▏    | 81545/156060 [01:19<01:08, 1084.75it/s][A
pos tagging 

pos tagging + lemmatizing...:  64%|██████▍   | 99889/156060 [01:37<00:49, 1133.59it/s][A
pos tagging + lemmatizing...:  64%|██████▍   | 100012/156060 [01:37<00:48, 1160.20it/s][A
pos tagging + lemmatizing...:  64%|██████▍   | 100129/156060 [01:37<00:49, 1133.73it/s][A
pos tagging + lemmatizing...:  64%|██████▍   | 100243/156060 [01:37<00:49, 1127.75it/s][A
pos tagging + lemmatizing...:  64%|██████▍   | 100361/156060 [01:38<00:48, 1142.22it/s][A
pos tagging + lemmatizing...:  64%|██████▍   | 100482/156060 [01:38<00:47, 1161.43it/s][A
pos tagging + lemmatizing...:  64%|██████▍   | 100599/156060 [01:38<00:48, 1150.85it/s][A
pos tagging + lemmatizing...:  65%|██████▍   | 100715/156060 [01:38<00:49, 1114.28it/s][A
pos tagging + lemmatizing...:  65%|██████▍   | 100827/156060 [01:38<00:51, 1082.66it/s][A
pos tagging + lemmatizing...:  65%|██████▍   | 100938/156060 [01:38<00:50, 1089.54it/s][A
pos tagging + lemmatizing...:  65%|██████▍   | 101059/156060 [01:38<00:49, 1121.01it/s][A


pos tagging + lemmatizing...:  77%|███████▋  | 120862/156060 [01:56<00:35, 993.83it/s][A
pos tagging + lemmatizing...:  78%|███████▊  | 120972/156060 [01:56<00:34, 1023.11it/s][A
pos tagging + lemmatizing...:  78%|███████▊  | 121077/156060 [01:56<00:33, 1030.48it/s][A
pos tagging + lemmatizing...:  78%|███████▊  | 121183/156060 [01:56<00:33, 1038.80it/s][A
pos tagging + lemmatizing...:  78%|███████▊  | 121294/156060 [01:56<00:32, 1055.50it/s][A
pos tagging + lemmatizing...:  78%|███████▊  | 121400/156060 [01:57<00:33, 1026.45it/s][A
pos tagging + lemmatizing...:  78%|███████▊  | 121503/156060 [01:57<00:33, 1016.99it/s][A
pos tagging + lemmatizing...:  78%|███████▊  | 121614/156060 [01:57<00:33, 1041.38it/s][A
pos tagging + lemmatizing...:  78%|███████▊  | 121719/156060 [01:57<00:34, 996.70it/s] [A
pos tagging + lemmatizing...:  78%|███████▊  | 121824/156060 [01:57<00:33, 1007.62it/s][A
pos tagging + lemmatizing...:  78%|███████▊  | 121926/156060 [01:57<00:33, 1008.65it/s][A


pos tagging + lemmatizing...:  88%|████████▊ | 137728/156060 [02:16<00:22, 832.72it/s][A
pos tagging + lemmatizing...:  88%|████████▊ | 137818/156060 [02:16<00:21, 851.41it/s][A
pos tagging + lemmatizing...:  88%|████████▊ | 137910/156060 [02:16<00:20, 870.06it/s][A
pos tagging + lemmatizing...:  88%|████████▊ | 137998/156060 [02:16<00:21, 852.79it/s][A
pos tagging + lemmatizing...:  88%|████████▊ | 138084/156060 [02:16<00:21, 835.55it/s][A
pos tagging + lemmatizing...:  89%|████████▊ | 138168/156060 [02:16<00:21, 829.30it/s][A
pos tagging + lemmatizing...:  89%|████████▊ | 138252/156060 [02:16<00:24, 729.37it/s][A
pos tagging + lemmatizing...:  89%|████████▊ | 138328/156060 [02:17<00:26, 671.71it/s][A
pos tagging + lemmatizing...:  89%|████████▊ | 138413/156060 [02:17<00:24, 715.83it/s][A
pos tagging + lemmatizing...:  89%|████████▊ | 138488/156060 [02:17<00:24, 711.90it/s][A
pos tagging + lemmatizing...:  89%|████████▉ | 138562/156060 [02:17<00:29, 588.38it/s][A
pos taggin

In [3]:
test = pd.read_csv("input/test.tsv", sep="\t", index_col="PhraseId")

print(test.shape)
test.head()

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,8545,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...
156063,8545,An
156064,8545,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine


## Preprocessing

In [4]:
train["Phrase(Origin)"] = train["Phrase"].copy()

print(train.shape)
train[["Phrase", "Phrase(Origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [5]:
test["Phrase(Origin)"] = test["Phrase"].copy()

print(test.shape)
test[["Phrase", "Phrase(Origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Clean Text

In [6]:
def clean_text(phrase):
    phrase = phrase.replace("ca n't", "can not")
    phrase = phrase.replace("does n't", "does not")
    phrase = phrase.replace("n't", "not")

    return phrase

In [7]:
train["Phrase"] = train["Phrase"].apply(clean_text)

print(train.shape)
train[["Phrase", "Phrase(Origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [8]:
test["Phrase"] = test["Phrase"].apply(clean_text)

print(test.shape)
test[["Phrase", "Phrase(Origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Vectorize Phrases

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=10000, ngram_range=(1, 2))
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
vectorizer.fit(train["Phrase"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [11]:
X_train = vectorizer.transform(train["Phrase"])
X_train

<156060x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 1152072 stored elements in Compressed Sparse Row format>

In [12]:
vocabulary = vectorizer.get_feature_names()
pd.DataFrame(X_train[0:100].toarray(), columns=vocabulary).head()

Unnamed: 0,000,10,10 minutes,10 year,100,101,11,12,12 year,13,...,your watch,yourself,youth,youthful,zany,zeal,zero,zhang,zombie,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
X_test = vectorizer.transform(test["Phrase"])
X_test

<66292x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 421280 stored elements in Compressed Sparse Row format>

In [14]:
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

### Train

In [15]:
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(random_state=37)
model

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

### Scoring

In [16]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GroupKFold

kfold = GroupKFold(n_splits=5)

y_predict = cross_val_predict(model, X_train, y_train,
                              cv=kfold, groups=train["SentenceId"])

print(y_predict.shape)
y_predict



(156060,)


array([3, 3, 2, ..., 2, 2, 2])

In [17]:
from sklearn.metrics import accuracy_score

score = accuracy_score(y_train, y_predict)

print("Score = {0:0.6f}".format(score))

Score = 0.583602


In [18]:
import numpy as np

result = train.copy()
result["Sentiment(predict)"] = y_predict

result["Distance"] = result["Sentiment"] - result["Sentiment(predict)"]
result["Distance"] = np.abs(result["Distance"])

result = result.sort_values(by="Distance", ascending=False)
result.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(Origin),Sentiment(predict),Distance
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
30291,1407,"Like a skillful fisher , the director uses the...",4,"Like a skillful fisher , the director uses the...",0,4
41990,2021,To build a feel-good fantasy around a vain dic...,0,To build a feel-good fantasy around a vain dic...,4,4
82697,4270,This is one of the biggest disappointments of ...,0,This is one of the biggest disappointments of ...,4,4
82698,4270,is one of the biggest disappointments of the y...,0,is one of the biggest disappointments of the y...,4,4
82699,4270,is one of the biggest disappointments of the year,0,is one of the biggest disappointments of the year,4,4


In [19]:
# result[result["Phrase"].str.contains("can't")]

In [20]:
result[0:10000].to_csv("result.csv")

In [21]:
pd.DataFrame(vocabulary).to_csv("vocabulary.csv")

### Predict

In [22]:
model.fit(X_train, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [23]:
predictions = model.predict(X_test)

print(predictions.shape)
predictions

(66292,)


array([3, 3, 2, ..., 2, 2, 2])

## Submit

In [24]:
submission = pd.read_csv("input/sampleSubmission.csv", index_col="PhraseId")

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,2
156062,2
156063,2
156064,2
156065,2


In [25]:
submission["Sentiment"] = predictions

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,3
156062,3
156063,2
156064,3
156065,3


In [26]:
submission.to_csv("baseline-script.csv")