In [1]:
import pandas as pd

### Load Dataset

In [2]:
train = pd.read_csv("input/train.tsv", sep="\t", index_col="PhraseId")

print(train.shape)
train.head()

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [3]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from tqdm import tqdm

def pos_phrase(phrase):
    words = word_tokenize(phrase)
    word_pos = [word + "_" + pos for word, pos in pos_tag(words)]

    return " ".join(word_pos)

tqdm.pandas(desc="pos tagging...")
train["Phrase"].progress_apply(pos_phrase)

pos tagging...: 100%|██████████| 156060/156060 [01:40<00:00, 1554.66it/s]


PhraseId
1         A_DT series_NN of_IN escapades_NNS demonstrati...
2         A_DT series_NN of_IN escapades_NNS demonstrati...
3                                            A_DT series_NN
4                                                      A_DT
5                                                 series_NN
6         of_IN escapades_NNS demonstrating_VBG the_DT a...
7                                                     of_IN
8         escapades_NNS demonstrating_VBG the_DT adage_N...
9                                             escapades_NNS
10        demonstrating_VBG the_DT adage_NN that_IN what...
11                        demonstrating_VBG the_DT adage_NN
12                                        demonstrating_VBG
13                                          the_DT adage_NN
14                                                   the_DT
15                                                 adage_NN
16        that_DT what_WP is_VBZ good_JJ for_IN the_DT g...
17                             

In [4]:
def find_wordnet_pos(pos):
    if "V" in pos:
        return "v"
    else:
        return "n"
    
print(find_wordnet_pos("VB"))
print(find_wordnet_pos("NN"))
print(find_wordnet_pos("RB"))

v
n
n


In [5]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize_phrase(phrase):
    words = word_tokenize(phrase)
    words_pos = pos_tag(words)

    lemmatized_words = []

    for word, pos in words_pos:
        lemmatized_word = lemmatizer.lemmatize(word, pos=find_wordnet_pos(pos))
        lemmatized_words.append(lemmatized_word)
        
    return " ".join(lemmatized_words)

tqdm.pandas(desc="pos tagging + lemmatizing...")
train["Phrase"].progress_apply(lemmatize_phrase)

pos tagging + lemmatizing...: 100%|██████████| 156060/156060 [01:53<00:00, 1371.46it/s]


PhraseId
1         A series of escapade demonstrate the adage tha...
2         A series of escapade demonstrate the adage tha...
3                                                  A series
4                                                         A
5                                                    series
6         of escapade demonstrate the adage that what be...
7                                                        of
8         escapade demonstrate the adage that what be go...
9                                                  escapade
10        demonstrate the adage that what be good for th...
11                                    demonstrate the adage
12                                              demonstrate
13                                                the adage
14                                                      the
15                                                    adage
16                          that what be good for the goose
17                             

In [6]:
test = pd.read_csv("input/test.tsv", sep="\t", index_col="PhraseId")

print(test.shape)
test.head()

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,8545,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...
156063,8545,An
156064,8545,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine


## Preprocessing

In [7]:
train["Phrase(Origin)"] = train["Phrase"].copy()

print(train.shape)
train[["Phrase", "Phrase(Origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [8]:
test["Phrase(Origin)"] = test["Phrase"].copy()

print(test.shape)
test[["Phrase", "Phrase(Origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Clean Text

In [9]:
def clean_text(phrase):
    phrase = phrase.replace("ca n't", "can not")
    phrase = phrase.replace("does n't", "does not")
    phrase = phrase.replace("n't", "not")

    return phrase

In [10]:
train["Phrase"] = train["Phrase"].apply(clean_text)

print(train.shape)
train[["Phrase", "Phrase(Origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [11]:
test["Phrase"] = test["Phrase"].apply(clean_text)

print(test.shape)
test[["Phrase", "Phrase(Origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(Origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Vectorize Phrases

In [12]:
# from sklearn.neighbors import NearestNeighbors

# nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(1, 2))
#vectorizer = CountVectorizer(max_features=10000, ngram_range=(1, 2))
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [14]:
vectorizer.fit(train["Phrase"])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=30000, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [15]:
X_train = vectorizer.transform(train["Phrase"])
X_train

<156060x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 1443974 stored elements in Compressed Sparse Row format>

In [16]:
vocabulary = vectorizer.get_feature_names()
pd.DataFrame(X_train[0:100].toarray(), columns=vocabulary).head()

Unnamed: 0,000,10,10 000,10 minutes,10 or,10 year,10 years,100,100 minute,100 minutes,...,zippy,zishe,zombie,zombie you,zone,zone and,zone armed,zone episode,zucker,zucker brothers
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
X_test = vectorizer.transform(test["Phrase"])
X_test

<66292x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 469259 stored elements in Compressed Sparse Row format>

In [18]:
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

(156060,)


PhraseId
1    1
2    2
3    2
4    2
5    2
Name: Sentiment, dtype: int64

### Train

In [19]:
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(random_state=37)
model

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

### Scoring

In [20]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GroupKFold

kfold = GroupKFold(n_splits=5)

y_predict = cross_val_predict(model, X_train, y_train,
                              cv=kfold, groups=train["SentenceId"])

print(y_predict.shape)
y_predict



(156060,)


array([3, 2, 2, ..., 2, 2, 2])

In [21]:
from sklearn.metrics import accuracy_score

score = accuracy_score(y_train, y_predict)

print("Score = {0:0.6f}".format(score))

Score = 0.548065


In [22]:
import numpy as np

result = train.copy()
result["Sentiment(predict)"] = y_predict

result["Distance"] = result["Sentiment"] - result["Sentiment(predict)"]
result["Distance"] = np.abs(result["Distance"])

result = result.sort_values(by="Distance", ascending=False)
result.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,Phrase(Origin),Sentiment(predict),Distance
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8171,338,of the most highly-praised disappointments I,0,of the most highly-praised disappointments I,4,4
74654,3828,be one of the most appealing movies ever made ...,4,be one of the most appealing movies ever made ...,0,4
80728,4158,simply can not recommend it enough .,4,simply ca n't recommend it enough .,0,4
89393,4648,One of those rare films that seems as though i...,4,One of those rare films that seems as though i...,0,4
99219,5203,"The issues are presented in such a lousy way ,...",0,"The issues are presented in such a lousy way ,...",4,4


In [23]:
# result[result["Phrase"].str.contains("can't")]

In [24]:
result[0:10000].to_csv("result.csv")

In [25]:
pd.DataFrame(vocabulary).to_csv("vocabulary.csv")

### Predict

In [26]:
model.fit(X_train, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=37, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [27]:
predictions = model.predict(X_test)

print(predictions.shape)
predictions

(66292,)


array([3, 3, 3, ..., 2, 2, 2])

## Submit

In [28]:
submission = pd.read_csv("input/sampleSubmission.csv", index_col="PhraseId")

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,2
156062,2
156063,2
156064,2
156065,2


In [29]:
submission["Sentiment"] = predictions

print(submission.shape)
submission.head()

(66292, 1)


Unnamed: 0_level_0,Sentiment
PhraseId,Unnamed: 1_level_1
156061,3
156062,3
156063,3
156064,2
156065,2


In [30]:
submission.to_csv("baseline-script.csv")