In [18]:
import pandas as pd
import re
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance; `normalizer` below reuses it for every token.
wordnet_lemmatizer = WordNetLemmatizer()

In [2]:
# Load the raw tweet dataset; later cells read its `text` and
# `airline_sentiment` columns.
df = pd.read_csv(filepath_or_buffer='Twitter_Data.csv')

In [3]:
# Build the stop-word lookup once. Calling `stopwords.words('english')` inside
# the per-word filter re-reads the corpus and does a linear list scan for every
# single token, which dominates the runtime of `normalizer`.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def normalizer(tweet):
    """Return a cleaned, lemmatized version of ``tweet``.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space;
      2. lower-case and split on whitespace;
      3. drop NLTK English stop words;
      4. lemmatize each remaining token with ``wordnet_lemmatizer``.

    Args:
        tweet: the raw tweet text (a ``str``).

    Returns:
        A single string of the surviving lemmas joined by one space
        (empty string when nothing survives the filtering).
    """
    tokens = re.sub("[^a-zA-Z]", " ", tweet).lower().split()
    kept = (token for token in tokens if token not in _ENGLISH_STOPWORDS)
    return ' '.join(wordnet_lemmatizer.lemmatize(token) for token in kept)

In [4]:
# Shuffle the rows with a fixed seed so that Restart Kernel -> Run All yields
# the same row order (and therefore the same downstream results) every time.
df = shuffle(df, random_state=42)
# Target labels: the sentiment class of each tweet.
y = df['airline_sentiment']
# Features: the tweet text cleaned by `normalizer`.
x = df['text'].apply(normalizer)

In [7]:
vectorizer = CountVectorizer()
x_vectorized = vectorizer.fit_transform(x)

In [8]:
# Hold out a validation split (default test size). The fixed seed keeps the
# split, and hence the reported metrics, reproducible across runs.
x_train, x_val, y_train, y_val = train_test_split(x_vectorized, y, random_state=42)

In [9]:
# Baseline multinomial logistic-regression classifier fitted on the training
# split. `fit` returns the estimator itself, so `model` and `regressor` refer
# to the same fitted object.
regressor = LogisticRegression(solver='newton-cg', multi_class='multinomial')
regressor.fit(x_train, y_train)
model = regressor

In [10]:
# Tune the inverse regularisation strength C with 5-fold cross-validation on
# the training split, then keep the best refitted estimator as `model`.
params = dict(C=[0.001, 0.01, 0.1, 1, 10, 100, 1000])
gs_clf = GridSearchCV(estimator=model, param_grid=params, cv=5, n_jobs=1).fit(x_train, y_train)
model = gs_clf.best_estimator_

In [11]:
# Score the tuned model on the held-out validation split.
y_pred = model.predict(x_val)

# NOTE: with average='micro' on single-label multiclass targets, precision,
# recall and F1 all reduce to the same number (overall accuracy), which is why
# the three scores printed below are identical.
_f1, __precision, _recall = (
    metric(y_val, y_pred, average='micro')
    for metric in (f1_score, precision_score, recall_score)
)
_confusion = confusion_matrix(y_val, y_pred)

# Collect everything in one dict (same keys and key order as reported below).
_statistics = dict(zip(
    ('f1_score', 'confusion matrix', 'precision', 'recall'),
    (_f1, _confusion, __precision, _recall),
))

In [12]:
# Show the collected validation metrics (plain-text repr of the dict).
print(_statistics)

{'f1_score': 0.7852459016393442, 'confusion matrix': array([[2049,  196,   53],
       [ 275,  432,   69],
       [ 109,   84,  393]], dtype=int64), 'precision': 0.7852459016393443, 'recall': 0.7852459016393443}


In [14]:
# Sanity check on an obviously negative sentence. The text goes through the
# same `normalizer` used on the training tweets so the vectorizer sees
# identically preprocessed tokens at inference time.
test_feature = vectorizer.transform([normalizer('This is an example of a Negative tweet: he is an awful person')])
model.predict(test_feature)

array(['negative'], dtype=object)

In [15]:
# Sanity check on an obviously positive sentence. The text goes through the
# same `normalizer` used on the training tweets so the vectorizer sees
# identically preprocessed tokens at inference time.
test_feature = vectorizer.transform([normalizer('You are a wonderful person')])
model.predict(test_feature)

array(['positive'], dtype=object)

In [16]:
# Sanity check on a short, neutral sentence. The text goes through the same
# `normalizer` used on the training tweets so the vectorizer sees identically
# preprocessed tokens at inference time.
test_feature = vectorizer.transform([normalizer("i'm okay")])
model.predict(test_feature)

array(['neutral'], dtype=object)

In [17]:
# Sanity check on a sentiment-free factual sentence. The text goes through the
# same `normalizer` used on the training tweets so the vectorizer sees
# identically preprocessed tokens at inference time.
test_feature = vectorizer.transform([normalizer('this is an apple')])
model.predict(test_feature)

array(['neutral'], dtype=object)

In [19]:
# Bundle the fitted vectorizer with the tuned model so they can be loaded
# together from a single file.
pickl = {'vectorizer': vectorizer,
         'model': model
         }
# The context manager guarantees the file is flushed and closed even if
# pickling raises; a bare `open(...)` passed to `pickle.dump` leaks the handle.
with open('models.p', 'wb') as model_file:
    pickle.dump(pickl, model_file)

In [None]:
# Export the first 100 rows of the (already shuffled) frame as a small sample
# file; the DataFrame index is written as the first CSV column.
df2 = df.iloc[:100]
df2.to_csv(path_or_buf='sample_file.csv')