- https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/

In [85]:
import pandas as pd
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
import re
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [74]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [116]:
stop_words = set(stopwords.words('english'))
stop_words.remove('no')
stop_words.remove('not')
def remove_stop_words(text):
    word_tokens = word_tokenize(text) 
    filtered = [token for token in word_tokens if token.lower() not in stop_words]
    filtered = ' '.join(filtered)
    return filtered
def remove_special_characters(text,remove_digits = False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern,'',text)
    return text

def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word)for word in text.split()])
    return text
def lemmatize(text):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word)for word in text.split()])
    return text
def POS(text):
    word_tokens = word_tokenize(text)
    pos = nltk.pos_tag(word_tokens)
    return pos

In [119]:
pos = []
for i, row in train.iterrows():
    new_text = remove_stop_words(train.at[i,'text'])
    new_text = remove_special_characters(train.at[i,'text'], remove_digits = True)
    new_text = lemmatize(new_text)
    new_text = simple_stemmer(new_text)
    pos.append(POS(new_text))
    train.at[i,'text'] = new_text

In [123]:
pos

[[('our', 'PRP$'),
  ('deed', 'NN'),
  ('are', 'VBP'),
  ('the', 'DT'),
  ('reason', 'NN'),
  ('of', 'IN'),
  ('thi', 'NN'),
  ('earthquak', 'NN'),
  ('may', 'MD'),
  ('allah', 'VB'),
  ('forgiv', 'VB'),
  ('u', 'JJ'),
  ('all', 'DT')],
 [('forest', 'JJS'),
  ('fire', 'NN'),
  ('near', 'IN'),
  ('La', 'NNP'),
  ('rong', 'FW'),
  ('sask', 'NN'),
  ('canada', 'NN')],
 [('all', 'DT'),
  ('resid', 'NN'),
  ('ask', 'NN'),
  ('to', 'TO'),
  ('shelter', 'VB'),
  ('in', 'IN'),
  ('place', 'NN'),
  ('are', 'VBP'),
  ('be', 'VB'),
  ('notifi', 'VBN'),
  ('by', 'IN'),
  ('offic', 'JJ'),
  ('No', 'NNP'),
  ('other', 'JJ'),
  ('evacu', 'NN'),
  ('or', 'CC'),
  ('shelter', 'NN'),
  ('in', 'IN'),
  ('place', 'NN'),
  ('order', 'NN'),
  ('are', 'VBP'),
  ('expect', 'VB')],
 [('peopl', 'NN'),
  ('receiv', 'NN'),
  ('wildfir', 'NN'),
  ('evacu', 'NN'),
  ('order', 'NN'),
  ('in', 'IN'),
  ('california', 'NN')],
 [('just', 'RB'),
  ('got', 'VBD'),
  ('sent', 'JJ'),
  ('thi', 'NN'),
  ('photo', 'NN'),
  (

In [125]:
classifier = nltk.DecisionTreeClassifier.train(pos)

ValueError: too many values to unpack (expected 2)

In [77]:
train_x,valid_x,train_y,valid_y = model_selection.train_test_split(train['text'],train['target'])

In [78]:
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [95]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

tfidf_vect_ngram = TfidfVectorizer(analyzer = 'word', token_pattern= r'w{1,}',ngram_range=(2,3),max_features=5000)
tfidf_vect_ngram.fit(train['text'])
train_vectors = tfidf_vect_ngram.transform(train_x)
valid_vectors = tfidf_vect_ngram.transform(valid_x)

tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(train['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x)




In [105]:
all_train = tfidf_vect.transform(train['text'])
all_train_y = train['target']
test_vectors = tfidf_vect.transform(test['text'])

In [89]:
def train_model(classifier,train_vectors,target,valid_vectors):
    classifier.fit(train_vectors,target)
    predictions = classifier.predict(valid_vectors)
    return metrics.accuracy_score(predictions,valid_y)

In [112]:
ridge_regression = train_model(linear_model.RidgeClassifier(),xtrain_tfidf,train_y,xvalid_tfidf)
ridge_regression

0.7988445378151261

In [98]:
log_reg_accuracy = train_model(linear_model.LogisticRegression(),xtrain_tfidf,train_y,xvalid_tfidf)
log_reg_accuracy



0.8077731092436975

In [99]:
nv_accuracy = train_model(naive_bayes.MultinomialNB(),xtrain_tfidf,train_y,xvalid_tfidf)
nv_accuracy

0.7998949579831933

In [100]:
def test_model(classifier,all_train,target,test_vectors):
    classifier.fit(all_train,target)
    predictions = classifier.predict(test_vectors)
    return predictions

In [113]:
predictions = test_model(linear_model.RidgeClassifier(),all_train,all_train_y,test_vectors)

In [114]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission['target'] = predictions

In [115]:
sample_submission.to_csv('submisssion.csv',index = False)