In [1]:
import pandas as pd

### Load Dataset

In [2]:
# Read the tab-separated training file, using the PhraseId column as the row index.
train = pd.read_csv("input/train.tsv", sep="\t", index_col="PhraseId")

# Show the frame's dimensions and its first rows as a quick sanity check.
print(train.shape)
train.head()

(156060, 3)


Unnamed: 0_level_0,SentenceId,Phrase,Sentiment
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,A series of escapades demonstrating the adage ...,1
2,1,A series of escapades demonstrating the adage ...,2
3,1,A series,2
4,1,A,2
5,1,series,2


In [3]:
# Read the tab-separated test file, using the PhraseId column as the row index.
test = pd.read_csv("input/test.tsv", sep="\t", index_col="PhraseId")

# Show the frame's dimensions and its first rows as a quick sanity check.
print(test.shape)
test.head()

(66292, 2)


Unnamed: 0_level_0,SentenceId,Phrase
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,8545,An intermittently pleasing but mostly routine ...
156062,8545,An intermittently pleasing but mostly routine ...
156063,8545,An
156064,8545,intermittently pleasing but mostly routine effort
156065,8545,intermittently pleasing but mostly routine


## Preprocessing

In [4]:
# Keep an untouched copy of the raw text in "Phrase(origin)" so the processed
# "Phrase" column can be displayed next to it in the cells that follow.
train["Phrase(origin)"] = train["Phrase"].copy()

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [5]:
# Same backup for the test split: "Phrase(origin)" holds the raw text while
# "Phrase" is rewritten by the preprocessing cells.
test["Phrase(origin)"] = test["Phrase"].copy()

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Clean Text

In [6]:
def clean_text(phrase):
    """Expand negative contractions in a phrase.

    Rewrites ``doesn't`` -> ``does not``, ``ca n't`` -> ``can not`` and a
    standalone ``n't`` token -> ``not``.  A contraction is recognised when it
    is followed by a space *or* sits at the end of the phrase (and, for the
    bare ``n't`` token, when it is preceded by a space or starts the phrase),
    so sub-phrases such as ``"is n't"`` are expanded as well.

    Parameters
    ----------
    phrase : str
        Text to clean.

    Returns
    -------
    str
        The phrase with the contractions above expanded; all other text is
        returned unchanged.
    """
    import re  # local import keeps this cell self-contained

    # (?= |$) checks for a following space or end-of-phrase without consuming
    # it, so the surrounding spacing is preserved exactly.
    phrase = re.sub(r"doesn't(?= |$)", "does not", phrase)
    # "ca n't" must be handled before the generic rule, otherwise it would
    # become "ca not".
    phrase = re.sub(r"ca n't(?= |$)", "can not", phrase)
    # Generic tokenised contraction: keep the leading space (if any) via \1.
    phrase = re.sub(r"(^| )n't(?= |$)", r"\1not", phrase)

    return phrase

# Overwrite "Phrase" with the cleaned text; "Phrase(origin)" is not modified here.
train["Phrase"] = train["Phrase"].apply(clean_text)

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

(156060, 4)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
2,A series of escapades demonstrating the adage ...,A series of escapades demonstrating the adage ...
3,A series,A series
4,A,A
5,series,series


In [None]:
# Apply the same cleaning function to the test phrases.
test["Phrase"] = test["Phrase"].apply(clean_text)

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

(66292, 3)


Unnamed: 0_level_0,Phrase,Phrase(origin)
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1
156061,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156062,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
156063,An,An
156064,intermittently pleasing but mostly routine effort,intermittently pleasing but mostly routine effort
156065,intermittently pleasing but mostly routine,intermittently pleasing but mostly routine


### Stem & Lemmatize Phrases

In [None]:
from tqdm import tqdm
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

# Module-level instances shared by the helper functions defined below.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

def stem_phrase(phrase):
    """Stem each single-space-separated token of ``phrase`` with the
    module-level ``stemmer`` and return the tokens rejoined by single spaces."""
    return " ".join(stemmer.stem(token) for token in phrase.split(" "))

def firstCharLowerPoS(tag):
    """Return the first character of a part-of-speech tag, lower-cased.

    Raises ``IndexError`` for an empty tag, since the first character is
    accessed by index.
    """
    return tag[0].lower()

def postag_lemm_or_stem_phrase(phrase):
    """Normalize a phrase token by token, lemmatizing where possible.

    The phrase is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    Tokens tagged as noun, verb, adjective or adverb are lemmatized with the
    module-level ``lemmatizer`` using the matching WordNet part of speech;
    every other token (punctuation, determiners, ...) is stemmed with the
    module-level ``stemmer``.

    Parameters
    ----------
    phrase : str
        Text to normalize.

    Returns
    -------
    str
        The normalized tokens joined by single spaces.
    """
    # First letter of the tagger's tag -> WordNet POS code.  The default
    # Penn Treebank tagset marks adjectives as JJ/JJR/JJS, so the key for
    # adjectives is 'j' (not 'a'); WordNet itself calls that class 'a'.
    tag_to_wordnet_pos = {'n': 'n', 'v': 'v', 'j': 'a', 'r': 'r'}

    words = []
    for word, tag in pos_tag(word_tokenize(phrase)):
        wordnet_pos = tag_to_wordnet_pos.get(firstCharLowerPoS(tag))
        if wordnet_pos is not None:
            words.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
        else:
            words.append(stemmer.stem(word))

    return " ".join(words)

tqdm.pandas(desc="Stemming...")
train["Phrase"].progress_apply(postag_lemm_or_stem_phrase).head()

print(train.shape)
train[["Phrase", "Phrase(origin)"]].head()

Stemming...:  30%|██▉       | 46501/156060 [00:37<01:25, 1285.21it/s]

In [None]:
tqdm.pandas(desc="Stemming...")
test["Phrase"].progress_apply(postag_lemm_or_stem_phrase).head()

print(test.shape)
test[["Phrase", "Phrase(origin)"]].head()

### Vectorize phrases

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Character-level TF-IDF over n-grams of 1 to 9 characters, capped at 10000 features.
char_vectorizer = TfidfVectorizer(analyzer='char', max_features=10000, ngram_range=(1, 9))
char_vectorizer

In [None]:
# Fit the character vectorizer on the training phrases only.
char_vectorizer.fit(train["Phrase"])

In [None]:
# Character n-gram features for the training phrases.
X_train_char = char_vectorizer.transform(train["Phrase"])
X_train_char

In [None]:
# Character n-gram features for the test phrases, using the vectorizer fitted on train.
X_test_char = char_vectorizer.transform(test["Phrase"])
X_test_char

In [None]:
# Word-level TF-IDF over unigrams and bigrams, capped at 30000 features.
word_vectorizer = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(1, 2))
word_vectorizer

In [None]:
# Fit the word vectorizer on the training phrases only.
word_vectorizer.fit(train["Phrase"])

In [None]:
# Word n-gram features for the training phrases.
X_train_word = word_vectorizer.transform(train["Phrase"])
X_train_word

In [None]:
# Word n-gram features for the test phrases, using the vectorizer fitted on train.
X_test_word = word_vectorizer.transform(test["Phrase"])
X_test_word

In [None]:
from scipy.sparse import hstack

# Concatenate the character and word feature matrices column-wise (char columns first).
X_train = hstack([X_train_char, X_train_word])
X_train

In [None]:
# Same column layout as X_train: character features first, then word features.
X_test = hstack([X_test_char, X_test_word])
X_test

In [None]:
# columns = word_vectorizer.get_feature_names()
# pd.DataFrame(X_train.tocsr()[:100].toarray(), columns=columns).head()

In [None]:
# Target labels: the "Sentiment" column of the training frame.
y_train = train["Sentiment"]

print(y_train.shape)
y_train.head()

In [None]:
# SentenceId of every training phrase; passed as the `groups` argument to
# cross-validation in the scoring section.
sentence_ids = train["SentenceId"]

print(sentence_ids.shape)
sentence_ids.head()

## Score

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent.
# random_state is fixed so repeated runs give the same result.
# NOTE(review): the alpha value looks like the output of a hyper-parameter search — confirm its origin.
model = SGDClassifier(alpha=0.000006762746, random_state=37)
model

In [None]:
# (sklearn.cross_validation was the former import location; model_selection is used here.)
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GroupKFold

# 5-fold split grouped by sentence: with groups=sentence_ids, phrases that
# share a SentenceId are kept in the same fold.
kfold = GroupKFold(n_splits=5)

# One cross-validated prediction per training row.
y_predict = cross_val_predict(model, X_train, y_train,
                              cv=kfold, groups=sentence_ids)

print(y_predict.shape)
y_predict[0:10]

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the cross-validated predictions against the true labels.
score = accuracy_score(y_train, y_predict)
print("Score = {0:.5f}".format(score))

In [None]:
import numpy as np

# Work on a copy so the error-analysis columns do not end up in `train`.
result = train.copy()
result["Sentiment(predict)"] = y_predict
# Absolute distance between the true and the predicted label for each phrase.
result["Difference(Phrase)"] = np.abs(y_train - y_predict)

print(result.shape)
result.head()

In [None]:
# Mean per-phrase error of each sentence, indexed by SentenceId.
sentiment = result.groupby("SentenceId")["Difference(Phrase)"].mean()
print(sentiment.shape)
sentiment.head()

In [None]:
def find_sentiment(sentence_id):
    """Look up the mean phrase error of ``sentence_id`` in the module-level ``sentiment`` Series."""
    return sentiment.loc[sentence_id]

# Attach the sentence-level mean error to every phrase row, then put the
# worst-predicted sentences first.
# NOTE(review): this is one Python-level lookup per row; a vectorized lookup would likely be faster — consider.
result["Difference(Sentence)"] = result["SentenceId"].apply(find_sentiment)
result = result.sort_values(by="Difference(Sentence)", ascending=False)

print(result.shape)
result.head()

In [None]:
# Export the first 1000 rows of the error-sorted frame for manual inspection.
result[0:1000].to_csv("result.csv")

In [None]:
# vocabulary = vectorizer.get_feature_names()
# vocabulary[0:3]

In [None]:
# pd.DataFrame(vocabulary, columns=["word"]).to_csv("vocabulary.csv")

In [None]:
# result[result["Phrase"].str.contains("can not recommend")]

## Train

In [None]:
# Install xgboost via a shell escape.
# NOTE(review): `!pip` may target a different environment than the running kernel and the
# version is unpinned — consider `%pip install` with a pinned version, in a cell at the top.
!pip install xgboost

In [None]:
import xgboost as xgb

# Training matrix built from the stacked TF-IDF features and the sentiment labels.
dtrain = xgb.DMatrix(X_train, label=y_train)

params = {
    'booster': 'gblinear',          # linear booster
    'objective': 'multi:softmax',   # multi-class objective
    'eval_metric': 'merror',
    'lambda': 2.186753e-03,
    'alpha': 1.286904,
    'lambda_bias': 6.191707e+00,
    'num_class': 5,
    'nthread': 8,
    # NOTE(review): 'silent' was superseded by 'verbosity' in later XGBoost releases — confirm against the installed version.
    'silent': 1,
}
# NOTE(review): the regularization values and num_boost_round look like tuned settings — confirm their origin.

# %time reports how long the training call takes.
%time booster = xgb.train(params, dtrain, num_boost_round=98)

In [None]:
# Keep the test features sparse, the same way dtrain is built from X_train.
# Densifying with .toarray() would materialize every zero of the TF-IDF
# matrix in memory; tocsr() only changes the sparse storage format.
dtest = xgb.DMatrix(X_test.tocsr())

predictions = booster.predict(dtest)

print(predictions.shape)
predictions[0:10]

## Submit

In [None]:
# Start from the sample submission so the PhraseId index and column layout are taken from it.
submission = pd.read_csv("input/sampleSubmission.csv", index_col="PhraseId")

# Store the predictions as integers.
# Assumes the rows of `predictions` are in the same order as the sample submission — TODO confirm.
submission["Sentiment"] = predictions.astype('int')

print(submission.shape)
submission.head()

In [None]:
# The output path (e.g. baseline-script.csv) depends on each user's setup.
submission.to_csv("use-xgboost.csv")