In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from wordcloud import WordCloud

In [None]:
from google.colab import files

# files.upload() returns a {filename: raw bytes} dict. Binding the result keeps
# the notebook from echoing the entire CSV payload as this cell's output.
uploaded = files.upload()
print('uploaded:', list(uploaded))

In [None]:
data = pd.read_csv('Tweets.csv')
data.head(5)

In [None]:
data.shape

In [None]:
airlines = data.airline.unique()
print(airlines)

In [None]:

# Tally rows per sentiment label, then report them next to the overall row count.
sentiment_labels = data['airline_sentiment']
Negative = int((sentiment_labels == 'negative').sum())
Neutral = int((sentiment_labels == 'neutral').sum())
Positive = int((sentiment_labels == 'positive').sum())

for caption, tally in (
    ('negative:', Negative),
    ('neutral:', Neutral),
    ('positive:', Positive),
    ('Total:', len(data)),
):
    print(caption, tally)



In [None]:
def get_airline_sentiment(data, airlines):
    """Count tweets per sentiment class for each airline.

    Parameters
    ----------
    data : DataFrame
        Must provide the 'airline', 'airline_sentiment' and 'tweet_id' columns.
    airlines : iterable
        Airline names to tally; the output lists follow this order.

    Returns
    -------
    tuple of three lists ``(positives, negatives, neutrals)``, each aligned
    with ``airlines``. An entry is the number of non-null ``tweet_id`` values
    for that airline/sentiment pair.
    """
    tallies = {'positive': [], 'negative': [], 'neutral': []}

    for carrier in airlines:
        carrier_rows = data[data['airline'] == carrier]
        for label, bucket in tallies.items():
            label_mask = carrier_rows['airline_sentiment'] == label
            # .count() skips missing ids, so rows without a tweet_id are not tallied.
            bucket.append(carrier_rows.loc[label_mask, 'tweet_id'].count())

    return tallies['positive'], tallies['negative'], tallies['neutral']


positives, negatives, neutrals = get_airline_sentiment(data, airlines)

# One row per airline; columns ordered neutral / positive / negative.
sentiment_airlines_df = pd.DataFrame(
    {'neutrals': neutrals, 'positives': positives, 'negatives': negatives},
    index=airlines,
)
sentiment_airlines_df

In [None]:
# Spot-check a handful of raw tweets next to their labels (values are row labels of `data`).
sample_data = [10,700,1150,2200,2500,4000,6050,8666,10089,1240,13000,13500,14000,14200,14500]
for row_label in sample_data:
    tweet = data.text[row_label]
    label = data.airline_sentiment[row_label]
    print(tweet,'\n','Sentiment:-- ',label,'\n')

In [None]:
import re
def process_text(text):
  """Return ``text`` coerced to ``str`` with punctuation stripped.

  Every character that is neither a word character (letters, digits,
  underscore) nor whitespace is removed.
  """
  as_string = str(text)
  return re.sub(r'[^\w\s]', '', as_string)


In [None]:
# Punctuation-free copy of every tweet, then a side-by-side look at a few rows.
data['cleaned'] = data['text'].map(process_text)
sample_data = [10, 700, 1150, 2200, 2500, 4000, 6050, 7666, 8089, 10407, 11000, 12500, 14000, 14200, 14500]
for row_label in sample_data:
    original = data.text[row_label]
    cleaned = data.cleaned[row_label]
    label = data.airline_sentiment[row_label]
    print('Original:\n',original,'\n', 'Cleaned:\n', cleaned,'\n', 'sentiment:\n',label)

In [None]:
sentiment_airlines_df.plot.bar(rot=0, figsize=(20,5))

plt.title('Sentiment count by airlines')
plt.xlabel('Airlines')
plt.ylabel('number of sentiments')

plt.show()

In [None]:
count = data['airline_sentiment'].count()
print(count)

In [None]:
# seaborn wants long-form data: one row per (airline, sentiment) pair with its
# tweet count. The original call passed no `data=` and its x/y/hue strings did
# not name any column, so nothing could be drawn.
sentiment_long = (
    sentiment_airlines_df
    .rename_axis('airline')
    .reset_index()
    .melt(id_vars='airline', var_name='sentiment', value_name='tweets')
)

ax = sns.barplot(data=sentiment_long, x='sentiment', y='tweets', hue='airline')
ax.set_title('Tweets per sentiment, split by airline')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of tweets')
plt.show()

In [None]:
# Work on an independent copy: a later cell adds a column to this frame, and
# assigning into a bare column subset of `data` raises SettingWithCopyWarning.
sentiment_df = data[['airline_sentiment','text']].copy()
sentiment_df.head()

In [None]:
sentiment_df['airline_sentiment'].hist()

In [None]:
sentiment_encode = {
    
    'negative':0,
    'neutral':1,
    'positive':2
}

In [None]:
sentiment_df['encoded sentiment']= sentiment_df['airline_sentiment'].map(sentiment_encode)
print(sentiment_df.head(5))

In [None]:
data_train, data_test = train_test_split(sentiment_df, train_size=0.30)
print(len(data_train))
print(len(data_test))


In [None]:
print(data_train)
#print(data_test.head(5))

In [None]:
vectorizer = TfidfVectorizer(max_features = 2000)
print(vectorizer)

In [None]:
x_train = vectorizer.fit_transform(data_train['text'])
x_train

In [None]:
# Only transform here. Calling fit_transform on the test split would relearn the
# vocabulary and IDF weights, so the columns of x_test would no longer line up
# with the features the models were trained on, and vectorizer.vocabulary_
# would stop matching the fitted coefficients.
x_test = vectorizer.transform(data_test['text'])
x_test

In [None]:
y_train = data_train['encoded sentiment']
y_test = data_test['encoded sentiment']
print(y_train)

In [None]:
data_train

In [None]:
data_test

In [None]:
  lr_model = LogisticRegression(max_iter=400, random_state = 123)


In [None]:
lr_model.fit(x_train, y_train)

In [None]:
lr_model.score(x_train, y_train)

In [None]:
lr_model.score(x_test, y_test)

In [None]:
lr_model.score(x_train, y_train)

In [None]:
lr_model.coef_

In [None]:
y_pred = lr_model.predict(x_train)
print(y_pred)
y_pred1 = lr_model.predict(x_test)
print(y_pred1)

In [None]:
print('accuracy:',accuracy_score(y_test, y_pred1))

In [None]:
plt.hist(lr_model.coef_[1], bins = 40)

In [None]:
confusion_mat = confusion_matrix(y_test, y_pred1)
print(confusion_mat)

In [None]:
dt_model = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)

In [None]:
dt_model.fit(x_train, y_train)

In [None]:
dt_model.score(x_train, y_train)

In [None]:
dt_model.score(x_test, y_test)

In [None]:
y_pred = dt_model.predict(x_train)
print(y_pred)
y_pred1 = dt_model.predict(x_test)
print(y_pred1)

In [None]:
print('accuracy:',accuracy_score(y_test,y_pred1))

In [None]:
confusion_mat = confusion_matrix(y_test, y_pred1)
print(confusion_mat)

In [None]:
 mnb_model = MultinomialNB(alpha=0.15)

In [None]:
mnb_model.fit(x_train, y_train)

In [None]:
mnb_model.score(x_test, y_test)

In [None]:
y_pred = mnb_model.predict(x_train)
print(y_pred)
y_pred1 = mnb_model.predict(x_test)
print(y_pred1)

In [None]:
print('accuracy:',accuracy_score(y_test,y_pred1))

In [None]:
confusion_mat = confusion_matrix(y_test, y_pred1)
print(confusion_mat)

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=3, metric = 'minkowski')

In [None]:
knn_model.fit(x_train, y_train)

In [None]:
knn_model.score(x_test, y_test)

In [None]:
y_pred = knn_model.predict(x_train)
print(y_pred)
y_pred1 = knn_model.predict(x_test)
print(y_pred1)

In [None]:
print('accuracy:',accuracy_score(y_test,y_pred1))

In [None]:
confusion_mat = confusion_matrix(y_test, y_pred1)
print(confusion_mat)

In [None]:
 lsvm_model = SVC(kernel = 'linear',C = 0.2, probability=True, random_state = 0)

In [None]:
lsvm_model.fit(x_train, y_train)

In [None]:
lsvm_model.score(x_test, y_test)

In [None]:
y_pred = lsvm_model.predict(x_train)
print(y_pred)
y_pred1 = lsvm_model.predict(x_test)
print(y_pred1)

In [None]:
print('accuracy:',accuracy_score(y_test,y_pred1))

In [None]:
confusion_mat = confusion_matrix(y_test, y_pred1)
print(confusion_mat)

In [None]:
rsvm_model = SVC(kernel = 'rbf',C = 0.2, probability=True, random_state = 0)

In [None]:
rsvm_model.fit(x_train, y_train)

In [None]:
rsvm_model.score(x_test, y_test)

In [None]:
y_pred = rsvm_model.predict(x_train)
print(y_pred)
y_pred1 = rsvm_model.predict(x_test)
print(y_pred1)

In [None]:
print('accuracy:',accuracy_score(y_test,y_pred1))

In [None]:
confusion_mat = confusion_matrix(y_test, y_pred1)
print(confusion_mat)

In [None]:
classes = ['negative', 'positive', 'neutral'] 

In [None]:
df_cm = pd.DataFrame(confusion_mat, index=classes, columns=classes)

ax = sns.heatmap(df_cm, annot=True, cmap="YlGnBu")
ax.set_xlabel("Predicted")
ax.set_ylabel("Target")

In [None]:
word_index_map = vectorizer.vocabulary_
word_index_map

In [None]:
thresholds= [2,  1.8,  1.5]


In [None]:
# coef_ holds one row of term weights per class, ordered like lr_model.classes_.
# Row 0 belongs to class code 0 ('negative'), so reading coef_[0] here listed
# the terms that push a tweet towards *negative*. Look the row up through the
# label encoding instead of hard-coding an index.
positive_row = list(lr_model.classes_).index(sentiment_encode['positive'])
positive_weights = lr_model.coef_[positive_row]

# Walk the cut-offs from strictest to loosest. After the loop,
# `most_positive_words` holds the selection for the last threshold, which is
# the one the following cells consume.
for threshold in thresholds:
    print("Most positive words (weight > {}):".format(threshold))
    most_positive_words = {
        word: positive_weights[index]
        for word, index in word_index_map.items()
        if positive_weights[index] > threshold
    }
    for word, weight in most_positive_words.items():
        print(word, weight)
    print()

In [None]:
most_positive_words_df = pd.DataFrame.from_dict(most_positive_words, columns = ['weights'], orient = 'index')
most_positive_words_df

In [None]:
print("Wordcloud for most positive words")

positive_wordcloud = WordCloud(background_color="white", width=500, height=400)
positive_wordcloud.generate_from_frequencies(most_positive_words)
plt.figure(figsize=(8,8))
plt.imshow(positive_wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Bar chart of the selected positive words and their weights.
positive_ax = most_positive_words_df.plot.bar(rot=0, figsize=(20,5))

# This chart shows the positive words; the title used to say 'Most negative words'.
positive_ax.set_title('Most positive words')
positive_ax.set_xlabel('word')
positive_ax.set_ylabel('weight')

plt.show()

In [None]:
# coef_ holds one row of term weights per class, ordered like lr_model.classes_.
# The terms most indicative of a negative tweet are those with the largest
# weights in the 'negative' row. The previous `coef_[0] < -threshold` test
# selected the opposite: terms that argue *against* the negative class.
negative_row = list(lr_model.classes_).index(sentiment_encode['negative'])
negative_weights = lr_model.coef_[negative_row]

# Walk the cut-offs from strictest to loosest. After the loop,
# `most_negative_words` holds the selection for the last threshold, which is
# the one the following cells consume.
for threshold in thresholds:
    print("Most negative words (weight > {}):".format(threshold))
    most_negative_words = {
        word: negative_weights[index]
        for word, index in word_index_map.items()
        if negative_weights[index] > threshold
    }
    for word, weight in most_negative_words.items():
        print(word, weight)
    print()

In [None]:
most_negative_words_df = pd.DataFrame.from_dict(most_negative_words, columns = ['weights'], orient = 'index')
most_negative_words_df

In [None]:
print("Wordcloud for most negative words")

negative_wordcloud = WordCloud(background_color="white", width=500, height=400)
negative_wordcloud.generate_from_frequencies(most_negative_words)
plt.figure(figsize=(8,8))
plt.imshow(negative_wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Bar chart of the selected negative words and their weights.
negative_ax = most_negative_words_df.plot.bar(rot=0, figsize=(20,5))

negative_ax.set_title('Most negative words')
negative_ax.set_xlabel('word')
# Axis label previously misspelled as 'weigth'.
negative_ax.set_ylabel('weight')

plt.show()

In [None]:
# The stop-word corpus is not bundled with nltk; fetch it so this cell also
# works on a fresh kernel.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

# The words live in the frame's index. Iterating the DataFrame itself yields
# its column names, so the previous loop only ever saw the string 'weights'.
without_most_positive_stopword = [
    w for w in most_positive_words_df.index if w not in stop_words
]

print(len(without_most_positive_stopword), len(most_positive_words_df))


In [None]:
# The stop-word corpus is not bundled with nltk; fetch it so this cell also
# works on a fresh kernel.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

# The words live in the frame's index. Iterating the DataFrame itself yields
# its column names, so the previous loop only ever saw the string 'weights'.
without_most_negative_stopword = [
    w for w in most_negative_words_df.index if w not in stop_words
]

print(len(without_most_negative_stopword), len(most_negative_words_df))


In [None]:
# Word counts before and after stop-word removal, one column per sentiment.
# Each column lists [before, after] for its own sentiment; previously
# 'Positive' held both "before" counts and 'Negative' both "after" counts.
filtered_df = pd.DataFrame({
    'State': ["Before", "After"],
    'Positive': [len(most_positive_words), len(without_most_positive_stopword)],
    'Negative': [len(most_negative_words), len(without_most_negative_stopword)],
})

filtered_df