

Author: Aman Pandey



Data Discription:

The train.csv has three columns: id, text, and Target. text column has Tweets and Target column has sentiments (-1,0,1). In the Target column, values of -1,1,0 correspond to negative, positive, and neutral sentiment

In [3]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
import spacy
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.porter import PorterStemmer

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
#from TokenizationTest import TokenizationTest
from happyfuntokenizing import Tokenizer as potts
from sklearn.feature_extraction.text import CountVectorizer

from sklearn import preprocessing
from sklearn import linear_model

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [55]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from nltk.stem.snowball import SnowballStemmer


# Data Preprocessing

In [4]:
# spaCy lemmatization needs tagger but disable the rest
nlp = spacy.load('en_core_web_sm', disable=['ner,parser'])
nlp.remove_pipe('ner')
nlp.remove_pipe('parser')

('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f0306f7b708>)

In [5]:
def get_part_of_speech_tags(token):
    
    """Maps POS tags to first character lemmatize() accepts.
    We are focussing on Verbs, Nouns, Adjectives and Adverbs here."""

    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    
    tag = nltk.pos_tag([token])[0][1][0].upper()
    
    return tag_dict.get(tag, wordnet.NOUN)

In [56]:
def whitespace_tokenizer(data):
  return str.split(data)

def potter_tokenizer(data):
  tokenizer= potts()
  return tokenizer.tokenize(data)

def spacy_tokenizer(data):
  spacy_tokens=nlp(data)
  return [token.text for token in spacy_tokens]

def spacy_lemmatizer(data):
  spacy_tokens=nlp(data)
  return [token.lemma_ for token in spacy_tokens]

def nltk_treebank_tokenizer(data):
  tokenizer= TreebankWordTokenizer()
  return tokenizer.tokenize(data)

def nltk_treebank_stemmer_tokenizer(data):
  tokenizer= TreebankWordTokenizer()
  stemmer = PorterStemmer()
  tokens= tokenizer.tokenize(data)
  return [stemmer.stem(token) for token in tokens]

def nltk_treebank_lemmatizer_tokenizer(data):
  tokenizer= TreebankWordTokenizer()
  lemmatizer = WordNetLemmatizer()  
  tokens= tokenizer.tokenize(data)
  return [lemmatizer.lemmatize(token, get_part_of_speech_tags(token)) for token in tokens]

def nltk_TweetTokenizer_stemmer(data):
  tokenizer = TweetTokenizer()
  stemmer = PorterStemmer()
  tokens= tokenizer.tokenize(data)
  return [stemmer.stem(token) for token in tokens]

def nltk_TweetTokenizer_SnowBallstemmer(data):
  tokenizer = TweetTokenizer()
  stemmer = SnowballStemmer(language='english')
  tokens= tokenizer.tokenize(data)
  return [stemmer.stem(token) for token in tokens] 

def potter_tokenizer_stemmer(data):
  tokenizer= potts()
  stemmer = PorterStemmer()
  tokens= tokenizer.tokenize(data)
  return [stemmer.stem(token) for token in tokens]

def potter_tokenizer_lemmatizer(data):
  tokenizer= potts()
  lemmatizer = WordNetLemmatizer()  
  tokens= tokenizer.tokenize(data)
  return [lemmatizer.lemmatize(token, get_part_of_speech_tags(token)) for token in tokens]

def nltk_TweetTokenizer_lemmatizer_tokenizer(data):
  tokenizer = TweetTokenizer()
  lemmatizer = WordNetLemmatizer()
  tokens= tokenizer.tokenize(data)
  return [lemmatizer.lemmatize(token, get_part_of_speech_tags(token)) for token in tokens] 



In [7]:

import re
import pandas as pd
dat = pd.read_csv('/content/train.csv', encoding='latin1')
dat.head()


Unnamed: 0,Id,text,Target
0,1,@USAirways ! THE WORST in customer service. @...,-1
1,2,@united call wait times are over 20 minutes an...,-1
2,3,@JetBlue what's up with the random delay on fl...,-1
3,4,@AmericanAir Good morning! Wondering why my p...,0
4,5,@united UA 746. Pacific Rim and Date Night cut...,-1


In [8]:
print(dat['text'][35])
dat['text']= dat.text.str.replace('\n','')
print(dat['text'][35])

@united
You really know how to piss people off. Your Farelock option is fake!
@unitedYou really know how to piss people off. Your Farelock option is fake!


In [9]:
dat['Target'].value_counts()

-1    4566
 0    1536
 1    1218
Name: Target, dtype: int64

In [9]:
	def read_data(filename, tokenizer):
		corpus=[]
		Y=[]
		for i in range(len(dat)):
		#id=dat['Id']
			text=dat['text'][i]
		#print(cols)
			label= dat['Target'][i]
			tokens=' '.join(tokenizer(text))
			corpus.append(tokens)
			Y.append(label)
				
		
		return corpus, Y

In [None]:
dat.shape

(7319, 3)

In [10]:
trainFile =  dat.iloc[:5000,]
testFile =  dat.iloc[5000:,]

In [11]:
trainFile.shape, testFile.shape

((5000, 3), (2320, 3))

# Basic Model

## Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
	def evaluate_tokenizer1(tokenizer):
		
		train_corpus, train_labels=read_data(trainFile, tokenizer)
		test_corpus, test_labels=read_data(testFile, tokenizer)
		
		vectorizer = CountVectorizer(max_features=10000, analyzer=str.split, lowercase=False, strip_accents=None, binary=True)
		X_train = vectorizer.fit_transform(train_corpus)
		X_test = vectorizer.transform(test_corpus)

		#le = preprocessing.LabelEncoder()
		#le.fit(train_labels)

		Y_train=train_labels
		Y_test=test_labels
		param_lr = {'C': [1,2,3,10,100],'penalty':['l1', 'l2']}

		grid_PLR = GridSearchCV(LogisticRegression(max_iter=10000, solver='liblinear'), param_lr, cv=5,
                          return_train_score=True, scoring='f1_macro')
		grid_PLR.fit(X_train, Y_train)
		print('Tokenizer', tokenizer.__name__)
		print('train score: ', grid_PLR.score(X_train, Y_train))
		print('test score: ', grid_PLR.score(X_test, Y_test))
	
		print("Best parameters: {}".format(grid_PLR.best_params_))
		print("Best cross-validation score: {:.4f}".format(grid_PLR.best_score_))
		


In [None]:
tokenizers= [whitespace_tokenizer,potter_tokenizer, spacy_tokenizer, nltk_treebank_tokenizer, nltk_treebank_stemmer_tokenizer, nltk_treebank_lemmatizer_tokenizer]
for tokenizer in tokenizers:
  print(evaluate_tokenizer1(tokenizer))

Tokenizer whitespace_tokenizer
train score:  0.9380055953936335
test score:  0.9380055953936335
Best parameters: {'C': 1, 'penalty': 'l2'}
Best cross-validation score: 0.7053
None
Tokenizer spacy_tokenizer
train score:  0.9378440289976937
test score:  0.9378440289976937
Best parameters: {'C': 1, 'penalty': 'l2'}
Best cross-validation score: 0.7325
None
Tokenizer nltk_treebank_tokenizer
train score:  0.9347389845873417
test score:  0.9347389845873417
Best parameters: {'C': 1, 'penalty': 'l2'}
Best cross-validation score: 0.7303
None
Tokenizer nltk_treebank_stemmer_tokenizer
train score:  0.9311519334529113
test score:  0.9311519334529113
Best parameters: {'C': 1, 'penalty': 'l2'}
Best cross-validation score: 0.7448
None
Tokenizer nltk_treebank_lemmatizer_tokenizer
train score:  0.9311880373268272
test score:  0.9311880373268272
Best parameters: {'C': 1, 'penalty': 'l2'}
Best cross-validation score: 0.7370
None


In [36]:
def evaluate_tokenizer1(tokenizer):
  
  train_corpus, train_labels=read_data(trainFile, tokenizer)
  test_corpus, test_labels=read_data(testFile, tokenizer)
  
  vectorizer = CountVectorizer(max_features=10000, analyzer=str.split, lowercase=False, strip_accents=None, binary=True)
  X_train = vectorizer.fit_transform(train_corpus)
  X_test = vectorizer.transform(test_corpus)

  #le = preprocessing.LabelEncoder()
  #le.fit(train_labels)

  Y_train=train_labels
  Y_test=test_labels
  param_lr = {'C': [1,2,3,10,100],'penalty':['l1', 'l2']}

  grid_PLR = GridSearchCV(LogisticRegression(max_iter=10000, solver='liblinear'), param_lr, cv=5,
                        return_train_score=True, scoring='f1_macro')
  grid_PLR.fit(X_train, Y_train)
  print('Tokenizer', tokenizer.__name__)
  print('train score: ', grid_PLR.score(X_train, Y_train))
  print('test score: ', grid_PLR.score(X_test, Y_test))

  print("Best parameters: {}".format(grid_PLR.best_params_))
  print("Best cross-validation score: {:.4f}".format(grid_PLR.best_score_))
		


In [41]:
tokenizers= [nltk_TweetTokenizer_stemmer, nltk_TweetTokenizer_lemmatizer_tokenizer]
for tokenizer in tokenizers:
  print(evaluate_tokenizer1(tokenizer))

Tokenizer nltk_TweetTokenizer_stemmer
train score:  0.8568420023056212
test score:  0.8568420023056212
Best parameters: {'C': 1, 'penalty': 'l1'}
Best cross-validation score: 0.7448
None
Tokenizer nltk_TweetTokenizer_lemmatizer_tokenizer
train score:  0.8541423078201452
test score:  0.8541423078201452
Best parameters: {'C': 1, 'penalty': 'l1'}
Best cross-validation score: 0.7354
None


### Logistic Regression with Stop Words

In [46]:
def evaluate_tokenizer5(tokenizer):
  
  train_corpus, train_labels=read_data(trainFile, tokenizer)
  test_corpus, test_labels=read_data(testFile, tokenizer)
  
  vectorizer = CountVectorizer(max_features=10000, analyzer=str.split, stop_words="english", lowercase=False, strip_accents=None, binary=False)
  X_train = vectorizer.fit_transform(train_corpus)
  X_test = vectorizer.transform(test_corpus)

  #le = preprocessing.LabelEncoder()
  #le.fit(train_labels)

  Y_train=train_labels
  Y_test=test_labels
  param_lr = {'C': [1,2,3,10,100],'penalty':['l1', 'l2']}

  grid_PLR = GridSearchCV(LogisticRegression(max_iter=10000, solver='liblinear'), param_lr, cv=5,
                        return_train_score=True, scoring='f1_macro')
  grid_PLR.fit(X_train, Y_train)
  print('Tokenizer', tokenizer.__name__)
  print('train score: ', grid_PLR.score(X_train, Y_train))
  print('test score: ', grid_PLR.score(X_test, Y_test))

  print("Best parameters: {}".format(grid_PLR.best_params_))
  print("Best cross-validation score: {:.4f}".format(grid_PLR.best_score_))
		


In [47]:
tokenizers= [nltk_treebank_lemmatizer_tokenizer, nltk_TweetTokenizer_stemmer, nltk_TweetTokenizer_lemmatizer_tokenizer]
for tokenizer in tokenizers:
  print(evaluate_tokenizer5(tokenizer))

Tokenizer nltk_treebank_lemmatizer_tokenizer
train score:  0.9334895168644403
test score:  0.9334895168644403
Best parameters: {'C': 1, 'penalty': 'l2'}
Best cross-validation score: 0.7306
None
Tokenizer nltk_TweetTokenizer_stemmer
train score:  0.8585291591423031
test score:  0.8585291591423031
Best parameters: {'C': 1, 'penalty': 'l1'}
Best cross-validation score: 0.7403
None
Tokenizer nltk_TweetTokenizer_lemmatizer_tokenizer
train score:  0.9365982825163225
test score:  0.9365982825163225
Best parameters: {'C': 1, 'penalty': 'l2'}
Best cross-validation score: 0.7342
None


### Logistic Regression with TfidVectorizer

In [48]:
def evaluate_tokenizer6(tokenizer):
  
  train_corpus, train_labels=read_data(trainFile, tokenizer)
  test_corpus, test_labels=read_data(testFile, tokenizer)
  
  vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
  X_train = vectorizer.fit_transform(train_corpus)
  X_test = vectorizer.transform(test_corpus)

  #le = preprocessing.LabelEncoder()
  #le.fit(train_labels)

  Y_train=train_labels
  Y_test=test_labels
  param_lr = {'C': [1,2,3,10,100],'penalty':['l1', 'l2']}

  grid_PLR = GridSearchCV(LogisticRegression(max_iter=10000, solver='liblinear'), param_lr, cv=5,
                        return_train_score=True, scoring='f1_macro')
  grid_PLR.fit(X_train, Y_train)
  print('Tokenizer', tokenizer.__name__)
  print('train score: ', grid_PLR.score(X_train, Y_train))
  print('test score: ', grid_PLR.score(X_test, Y_test))

  print("Best parameters: {}".format(grid_PLR.best_params_))
  print("Best cross-validation score: {:.4f}".format(grid_PLR.best_score_))
		


In [49]:
tokenizers= [potter_tokenizer, nltk_treebank_lemmatizer_tokenizer, nltk_TweetTokenizer_stemmer, nltk_TweetTokenizer_lemmatizer_tokenizer]
for tokenizer in tokenizers:
  print(evaluate_tokenizer6(tokenizer))

Tokenizer potter_tokenizer
train score:  0.8622186954224386
test score:  0.8622186954224386
Best parameters: {'C': 3, 'penalty': 'l1'}
Best cross-validation score: 0.7159
None
Tokenizer nltk_treebank_lemmatizer_tokenizer
train score:  0.8603404304414314
test score:  0.8603404304414314
Best parameters: {'C': 3, 'penalty': 'l1'}
Best cross-validation score: 0.7259
None
Tokenizer nltk_TweetTokenizer_stemmer
train score:  0.8219731340606312
test score:  0.8219731340606312
Best parameters: {'C': 2, 'penalty': 'l1'}
Best cross-validation score: 0.7250
None
Tokenizer nltk_TweetTokenizer_lemmatizer_tokenizer
train score:  0.8609480756265535
test score:  0.8609480756265535
Best parameters: {'C': 3, 'penalty': 'l1'}
Best cross-validation score: 0.7275
None


### Logistic Regression with Hashing Vectorizer

In [51]:
def evaluate_tokenizer7(tokenizer):
  
  train_corpus, train_labels=read_data(trainFile, tokenizer)
  test_corpus, test_labels=read_data(testFile, tokenizer)
  
  vectorizer = HashingVectorizer()
  X_train = vectorizer.fit_transform(train_corpus)
  X_test = vectorizer.transform(test_corpus)

  #le = preprocessing.LabelEncoder()
  #le.fit(train_labels)

  Y_train=train_labels
  Y_test=test_labels
  param_lr = {'C': [1,2,3,10,100],'penalty':['l1', 'l2']}

  grid_PLR = GridSearchCV(LogisticRegression(max_iter=10000, solver='liblinear'), param_lr, cv=5,
                        return_train_score=True, scoring='f1_macro')
  grid_PLR.fit(X_train, Y_train)
  print('Tokenizer', tokenizer.__name__)
  print('train score: ', grid_PLR.score(X_train, Y_train))
  print('test score: ', grid_PLR.score(X_test, Y_test))

  print("Best parameters: {}".format(grid_PLR.best_params_))
  print("Best cross-validation score: {:.4f}".format(grid_PLR.best_score_))
		


In [52]:
tokenizers= [potter_tokenizer, nltk_treebank_lemmatizer_tokenizer, nltk_TweetTokenizer_stemmer, nltk_TweetTokenizer_lemmatizer_tokenizer]
for tokenizer in tokenizers:
  print(evaluate_tokenizer7(tokenizer))

Tokenizer potter_tokenizer
train score:  0.9067264682667906
test score:  0.9067264682667906
Best parameters: {'C': 10, 'penalty': 'l2'}
Best cross-validation score: 0.7228
None
Tokenizer nltk_treebank_lemmatizer_tokenizer
train score:  0.8967545109022157
test score:  0.8967545109022157
Best parameters: {'C': 10, 'penalty': 'l2'}
Best cross-validation score: 0.7255
None
Tokenizer nltk_TweetTokenizer_stemmer
train score:  0.8915249104884592
test score:  0.8915249104884592
Best parameters: {'C': 10, 'penalty': 'l2'}
Best cross-validation score: 0.7309
None
Tokenizer nltk_TweetTokenizer_lemmatizer_tokenizer
train score:  0.8973321161193947
test score:  0.8973321161193947
Best parameters: {'C': 10, 'penalty': 'l2'}
Best cross-validation score: 0.7285
None


## Naïve Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB


In [None]:
	def evaluate_tokenizer2(tokenizer):
		
		train_corpus, train_labels=read_data(trainFile, tokenizer)
		test_corpus, test_labels=read_data(testFile, tokenizer)
		
		vectorizer = CountVectorizer(max_features=10000, analyzer=str.split, lowercase=False, strip_accents=None, binary=True)
		X_train = vectorizer.fit_transform(train_corpus)
		X_test = vectorizer.transform(test_corpus)

		#le = preprocessing.LabelEncoder()
		#le.fit(train_labels)

		Y_train=train_labels
		Y_test=test_labels
		naive_param = { 'alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001) }

		naive_grid = GridSearchCV(MultinomialNB(), naive_param, cv=5,
                          return_train_score=True,n_jobs=-1, scoring='f1_macro')
		naive_grid.fit(X_train, Y_train)
		print('Tokenizer', tokenizer.__name__)
		print('train score: ', naive_grid.score(X_train, Y_train))
		print('test score: ', naive_grid.score(X_test, Y_test))
	
		print("Best parameters: {}".format(naive_grid.best_params_))
		print("Best cross-validation score: {:.4f}".format(naive_grid.best_score_))

In [None]:
tokenizers= [whitespace_tokenizer,potter_tokenizer, spacy_tokenizer, nltk_treebank_tokenizer, nltk_treebank_stemmer_tokenizer, nltk_treebank_lemmatizer_tokenizer]
for tokenizer in tokenizers:
  print(evaluate_tokenizer2(tokenizer))

Tokenizer whitespace_tokenizer
train score:  0.9168729804038124
test score:  0.9168729804038124
Best parameters: {'alpha': 0.1}
Best cross-validation score: 0.6707
None
Tokenizer spacy_tokenizer
train score:  0.9180460050127951
test score:  0.9180460050127951
Best parameters: {'alpha': 0.1}
Best cross-validation score: 0.6941
None
Tokenizer nltk_treebank_tokenizer
train score:  0.9164870809910312
test score:  0.9164870809910312
Best parameters: {'alpha': 0.1}
Best cross-validation score: 0.6888
None
Tokenizer nltk_treebank_stemmer_tokenizer
train score:  0.913222868270705
test score:  0.913222868270705
Best parameters: {'alpha': 0.1}
Best cross-validation score: 0.6985
None
Tokenizer nltk_treebank_lemmatizer_tokenizer
train score:  0.9152943740038655
test score:  0.9152943740038655
Best parameters: {'alpha': 0.1}
Best cross-validation score: 0.6861
None


## XGBoost

In [34]:
from xgboost import XGBClassifier

### XGBoost with different Tokenizers

In [None]:
def evaluate_tokenizer3(tokenizer):
		
  train_corpus, train_labels=read_data(trainFile, tokenizer)
  test_corpus, test_labels=read_data(testFile, tokenizer)
  
  vectorizer = CountVectorizer(max_features=10000, analyzer=str.split, lowercase=False, strip_accents=None, binary=True)
  X_train = vectorizer.fit_transform(train_corpus)
  X_test = vectorizer.transform(test_corpus)

  #le = preprocessing.LabelEncoder()
  #le.fit(train_labels)

  Y_train=train_labels
  Y_test=test_labels

  xgbc= XGBClassifier(random_state=42,early_stopping_rounds=2,objective= 'multi:softmax')

  xgbc_param = {
            'max_depth' : [6, 8],
            'n_estimators' : [200,300,400],
            'learning_rate' : [ 0.4, 0.6, 0.8],
             'min_child_weight' : [1,3],
              #'subsample':[0.6,0.7,0.8,0.9,1]
            }

  grid_xgbc = GridSearchCV(xgbc, xgbc_param, cv=5,
                        return_train_score=True, scoring='f1_macro')
  grid_xgbc.fit(X_train, Y_train)
  print('Tokenizer', tokenizer.__name__)
  print('train score: ', grid_xgbc.score(X_train, Y_train))
  print('test score: ', grid_xgbc.score(X_test, Y_test))

  print("Best parameters: {}".format(grid_xgbc.best_params_))
  print("Best cross-validation score: {:.4f}".format(grid_xgbc.best_score_))

In [None]:
tokenizers= [whitespace_tokenizer,potter_tokenizer, spacy_tokenizer, nltk_treebank_tokenizer, nltk_treebank_stemmer_tokenizer, nltk_treebank_lemmatizer_tokenizer]
for tokenizer in tokenizers:
  print(evaluate_tokenizer3(tokenizer))

Tokenizer whitespace_tokenizer
train score:  0.953841117157232
test score:  0.953841117157232
Best parameters: {'learning_rate': 0.6, 'max_depth': 8, 'min_child_weight': 1, 'n_estimators': 150}
Best cross-validation score: 0.6992
None
Tokenizer spacy_tokenizer
train score:  0.8994174092347021
test score:  0.8994174092347021
Best parameters: {'learning_rate': 0.4, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 150}
Best cross-validation score: 0.7327
None
Tokenizer nltk_treebank_tokenizer
train score:  0.9668800242432911
test score:  0.9668800242432911
Best parameters: {'learning_rate': 0.6, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 200}
Best cross-validation score: 0.7205
None
Tokenizer nltk_treebank_stemmer_tokenizer
train score:  0.9698399232619833
test score:  0.9698399232619833
Best parameters: {'learning_rate': 0.6, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 200}
Best cross-validation score: 0.7435
None


In [203]:
def evaluate_tokenizer4(tokenizer):
		
  train_corpus, train_labels=read_data(trainFile, tokenizer)
  test_corpus, test_labels=read_data(testFile, tokenizer)
  
  vectorizer = CountVectorizer(max_features=10000, analyzer=str.split, lowercase=False, strip_accents=None, binary=True)
  X_train = vectorizer.fit_transform(train_corpus)
  X_test = vectorizer.transform(test_corpus)

  #le = preprocessing.LabelEncoder()
  #le.fit(train_labels)

  Y_train=train_labels
  Y_test=test_labels

  xgbc= XGBClassifier(random_state=42,early_stopping_rounds=2,objective= 'multi:softmax')

  xgbc_param = {
            'max_depth' : [6, 8,10],
            'n_estimators' : [150,200,300],
            'learning_rate' : [.4, 0.6,0.7],
             'min_child_weight' : [1],
              #'subsample':[0.6,0.7,0.8,0.9,1]
            }

  grid_xgbc = GridSearchCV(xgbc, xgbc_param, cv=5,
                        return_train_score=True, scoring='f1_macro')
  grid_xgbc.fit(X_train, Y_train)
  print('Tokenizer', tokenizer.__name__)
  print('train score: ', grid_xgbc.score(X_train, Y_train))
  print('test score: ', grid_xgbc.score(X_test, Y_test))

  print("Best parameters: {}".format(grid_xgbc.best_params_))
  print("Best cross-validation score: {:.4f}".format(grid_xgbc.best_score_))

In [204]:
tokenizers= [whitespace_tokenizer,potter_tokenizer, spacy_tokenizer, nltk_treebank_tokenizer, nltk_treebank_stemmer_tokenizer, nltk_treebank_lemmatizer_tokenizer]
for tokenizer in tokenizers:
  print(evaluate_tokenizer4(tokenizer))

Tokenizer whitespace_tokenizer
train score:  0.9437502620021648
test score:  0.9437502620021648
Best parameters: {'learning_rate': 0.4, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 300}
Best cross-validation score: 0.7035
None
Tokenizer spacy_tokenizer
train score:  0.9628180818649038
test score:  0.9628180818649038
Best parameters: {'learning_rate': 0.4, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 300}
Best cross-validation score: 0.7328
None
Tokenizer nltk_treebank_tokenizer
train score:  0.9704953868693681
test score:  0.9704953868693681
Best parameters: {'learning_rate': 0.7, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 200}
Best cross-validation score: 0.7230
None
Tokenizer nltk_treebank_stemmer_tokenizer
train score:  0.9698399232619833
test score:  0.9698399232619833
Best parameters: {'learning_rate': 0.6, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 200}
Best cross-validation score: 0.7435
None
Tokenizer nltk_treebank_lemmatizer_tokenize

In [53]:
def evaluate_tokenizer7(tokenizer):
		
  train_corpus, train_labels=read_data(trainFile, tokenizer)
  test_corpus, test_labels=read_data(testFile, tokenizer)
  
  vectorizer = CountVectorizer(max_features=10000, analyzer=str.split, lowercase=False, strip_accents=None, binary=True)
  X_train = vectorizer.fit_transform(train_corpus)
  X_test = vectorizer.transform(test_corpus)

  #le = preprocessing.LabelEncoder()
  #le.fit(train_labels)

  Y_train=train_labels
  Y_test=test_labels

  xgbc= XGBClassifier(random_state=42,early_stopping_rounds=2,objective= 'multi:softmax')

  xgbc_param = {
            'max_depth' : [6, 8],
            'n_estimators' : [200,300],
            'learning_rate' : [.4, 0.6,0.7],
             'min_child_weight' : [1],
              #'subsample':[0.6,0.7,0.8,0.9,1]
            }

  grid_xgbc = GridSearchCV(xgbc, xgbc_param, cv=5,
                        return_train_score=True, scoring='f1_macro')
  grid_xgbc.fit(X_train, Y_train)
  print('Tokenizer', tokenizer.__name__)
  print('train score: ', grid_xgbc.score(X_train, Y_train))
  print('test score: ', grid_xgbc.score(X_test, Y_test))

  print("Best parameters: {}".format(grid_xgbc.best_params_))
  print("Best cross-validation score: {:.4f}".format(grid_xgbc.best_score_))

In [54]:
tokenizers= [potter_tokenizer, nltk_treebank_stemmer_tokenizer, nltk_treebank_lemmatizer_tokenizer,nltk_TweetTokenizer_stemmer, nltk_TweetTokenizer_lemmatizer_tokenizer]
for tokenizer in tokenizers:
  print(evaluate_tokenizer7(tokenizer))

Tokenizer potter_tokenizer
train score:  0.9703904579654933
test score:  0.9703904579654933
Best parameters: {'learning_rate': 0.6, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 200}
Best cross-validation score: 0.7444
None
Tokenizer nltk_treebank_stemmer_tokenizer
train score:  0.967061715770095
test score:  0.967061715770095
Best parameters: {'learning_rate': 0.4, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 300}
Best cross-validation score: 0.7413
None
Tokenizer nltk_treebank_lemmatizer_tokenizer
train score:  0.9810408425908683
test score:  0.9810408425908683
Best parameters: {'learning_rate': 0.6, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 300}
Best cross-validation score: 0.7281
None
Tokenizer nltk_TweetTokenizer_stemmer
train score:  0.9731402420141827
test score:  0.9731402420141827
Best parameters: {'learning_rate': 0.6, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 200}
Best cross-validation score: 0.7482
None
Tokenizer nltk_TweetTokeni

In [71]:
def evaluate_tokenizer8(tokenizer):
		
  train_corpus, train_labels=read_data(trainFile, tokenizer)
  test_corpus, test_labels=read_data(testFile, tokenizer)
  
  vectorizer = CountVectorizer(max_features=10000, analyzer=str.split, lowercase=False, strip_accents=None, binary=True)
  X_train = vectorizer.fit_transform(train_corpus)
  X_test = vectorizer.transform(test_corpus)

  #le = preprocessing.LabelEncoder()
  #le.fit(train_labels)

  Y_train=train_labels
  Y_test=test_labels

  xgbc= XGBClassifier(random_state=42,early_stopping_rounds=2,objective= 'multi:softmax')

  xgbc_param = {
            'max_depth' : [5,6],
            'n_estimators' : [200],
            'learning_rate' : [0.6],
             'min_child_weight' : [1],
              'subsample':[0.6,0.7]
            }

  grid_xgbc = GridSearchCV(xgbc, xgbc_param, cv=3,
                        return_train_score=True, scoring='f1_macro')
  grid_xgbc.fit(X_train, Y_train)
  print('Tokenizer', tokenizer.__name__)
  print('train score: ', grid_xgbc.score(X_train, Y_train))
  print('test score: ', grid_xgbc.score(X_test, Y_test))

  print("Best parameters: {}".format(grid_xgbc.best_params_))
  print("Best cross-validation score: {:.4f}".format(grid_xgbc.best_score_))

In [72]:
tokenizers= [ nltk_TweetTokenizer_SnowBallstemmer,   nltk_treebank_stemmer_tokenizer]
for tokenizer in tokenizers:
  print(evaluate_tokenizer8(tokenizer))

Tokenizer nltk_TweetTokenizer_SnowBallstemmer
train score:  0.9646802909027682
test score:  0.9646802909027682
Best parameters: {'learning_rate': 0.6, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.6}
Best cross-validation score: 0.7310
None
Tokenizer nltk_treebank_stemmer_tokenizer
train score:  0.9665972169539022
test score:  0.9665972169539022
Best parameters: {'learning_rate': 0.6, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.7}
Best cross-validation score: 0.7223
None


In [57]:
def evaluate_tokenizer8(tokenizer):
		
  train_corpus, train_labels=read_data(trainFile, tokenizer)
  test_corpus, test_labels=read_data(testFile, tokenizer)
  
  vectorizer = CountVectorizer(max_features=10000, analyzer=str.split, lowercase=False, strip_accents=None, binary=True)
  X_train = vectorizer.fit_transform(train_corpus)
  X_test = vectorizer.transform(test_corpus)

  #le = preprocessing.LabelEncoder()
  #le.fit(train_labels)

  Y_train=train_labels
  Y_test=test_labels

  xgbc= XGBClassifier(random_state=42,early_stopping_rounds=2,objective= 'multi:softmax')

  xgbc_param = {
            'max_depth' : [6, 8],
            'n_estimators' : [150, 200,300],
            'learning_rate' : [.4, 0.6],
             'min_child_weight' : [1],
              #'subsample':[0.6,0.7,0.8,0.9,1]
            }

  grid_xgbc = GridSearchCV(xgbc, xgbc_param, cv=5,
                        return_train_score=True, scoring='f1_macro')
  grid_xgbc.fit(X_train, Y_train)
  print('Tokenizer', tokenizer.__name__)
  print('train score: ', grid_xgbc.score(X_train, Y_train))
  print('test score: ', grid_xgbc.score(X_test, Y_test))

  print("Best parameters: {}".format(grid_xgbc.best_params_))
  print("Best cross-validation score: {:.4f}".format(grid_xgbc.best_score_))

In [58]:
tokenizers= [potter_tokenizer_lemmatizer, potter_tokenizer_lemmatizer, nltk_TweetTokenizer_SnowBallstemmer]
for tokenizer in tokenizers:
  print(evaluate_tokenizer8(tokenizer))

Tokenizer potter_tokenizer_lemmatizer
train score:  0.9724521611031923
test score:  0.9724521611031923
Best parameters: {'learning_rate': 0.4, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 300}
Best cross-validation score: 0.7474
None
Tokenizer potter_tokenizer_lemmatizer
train score:  0.9724521611031923
test score:  0.9724521611031923
Best parameters: {'learning_rate': 0.4, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 300}
Best cross-validation score: 0.7474
None
Tokenizer nltk_TweetTokenizer_SnowBallstemmer
train score:  0.9759346866283684
test score:  0.9759346866283684
Best parameters: {'learning_rate': 0.4, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 300}
Best cross-validation score: 0.7476
None


### XGBoost with Ngram

In [85]:
def evaluate_tokenizer9(tokenizer):
		
  train_corpus, train_labels=read_data(trainFile, tokenizer)
  test_corpus, test_labels=read_data(testFile, tokenizer)
  
  vectorizer = CountVectorizer(max_features=10000, ngram_range=(2, 2), analyzer=str.split, lowercase=False, strip_accents=None, binary=True)
  X_train = vectorizer.fit_transform(train_corpus)
  X_test = vectorizer.transform(test_corpus)

  #le = preprocessing.LabelEncoder()
  #le.fit(train_labels)

  Y_train=train_labels
  Y_test=test_labels

  xgbc= XGBClassifier(random_state=42,early_stopping_rounds=2,objective= 'multi:softmax')

  xgbc_param = {
            'max_depth' : [6, 8],
            'n_estimators' : [200,300],
            'learning_rate' : [.4, 0.6,0.7],
             'min_child_weight' : [1],
              #'subsample':[0.6,0.7,0.8,0.9,1]
            }

  grid_xgbc = GridSearchCV(xgbc, xgbc_param, cv=5,
                        return_train_score=True, scoring='f1_macro')
  grid_xgbc.fit(X_train, Y_train)
  print('Tokenizer', tokenizer.__name__)
  print('train score: ', grid_xgbc.score(X_train, Y_train))
  print('test score: ', grid_xgbc.score(X_test, Y_test))

  print("Best parameters: {}".format(grid_xgbc.best_params_))
  print("Best cross-validation score: {:.4f}".format(grid_xgbc.best_score_))

In [86]:
tokenizers= [nltk_TweetTokenizer_stemmer]
for tokenizer in tokenizers:
  print(evaluate_tokenizer9(tokenizer))

Tokenizer nltk_TweetTokenizer_stemmer
train score:  0.9731402420141827
test score:  0.9731402420141827
Best parameters: {'learning_rate': 0.6, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 200}
Best cross-validation score: 0.7482
None


# Output File

In [73]:
def read_data_test(filename, tokenizer):
  corpus=[]
    #Y=[]
  k=0
  for i in range(len(filename)):
    #id=dat['Id']
    text=filename['text'][i]
    #print(cols)
    #label= dat['Target'][i]
    tokens=' '.join(tokenizer(text))
    corpus.append(tokens)
    #Y.append(label)
    
 

  return corpus

In [74]:
test_data = pd.read_csv('/content/test.csv', encoding='latin1')
test_data.head()

Unnamed: 0,id,text
0,7322,@AmericanAir In car gng to DFW. Pulled over 1h...
1,7323,"@AmericanAir after all, the plane didnÂÃÂªt ..."
2,7324,@SouthwestAir can't believe how many paying cu...
3,7325,@USAirways I can legitimately say that I would...
4,7326,@AmericanAir still no response from AA. great ...


In [75]:
test_data['text']= test_data.text.str.replace('\n','')

In [76]:
tokenizer=nltk_TweetTokenizer_stemmer

train_corpus, train_labels=read_data(dat, tokenizer)
test_corpus=read_data_test(test_data, tokenizer)

vectorizer = CountVectorizer(max_features=10000, analyzer=str.split, lowercase=False, strip_accents=None, binary=True)
X_train = vectorizer.fit_transform(train_corpus)
X_test = vectorizer.transform(test_corpus)

#le = preprocessing.LabelEncoder()
#le.fit(train_labels)
xgbc= XGBClassifier(random_state=42,early_stopping_rounds=2,objective= 'multi:softmax')
Y_train=train_labels
#Y_test=le.transform(test_labels)
xgbc_param = {
            'max_depth' : [6],
            'n_estimators' : [200],
            'learning_rate' : [0.6],
             'min_child_weight' : [1],
              #'subsample':[0.6,0.7,0.8,0.9,1]
            }

grid_final = GridSearchCV(xgbc, xgbc_param, cv=5,
                        return_train_score=True, scoring='f1_macro')

grid_final.fit(X_train, Y_train)



GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1,
                                     early_stopping_rounds=2, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='multi:softmax',
                                     random_state=42, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.6], 'max_depth': [6],
                         'min_child_weight': [1],

In [77]:

# Preprocessing of validation data, get predictions
test_data_labels = grid_final.predict(X_test)



In [79]:
# Create predictions to be submitted!
pd.DataFrame({'id':test_data.id, 'Target': test_data_labels}).to_csv('sample_submission.csv', index =False)  
print("Done :D") 

Done :D
