<a href="https://colab.research.google.com/github/adefirmanf/global-warming-sentiment/blob/main/Global_Warming_Exist%3F_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
# Initialization 
import seaborn as sns
import pandas as pd 
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('punkt')

from sklearn import model_selection, naive_bayes, svm
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.sentiment import SentimentAnalyzer
from sklearn.metrics import accuracy_score

nltk.download('stopwords')

lemmatizer = WordNetLemmatizer() 
  

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
# Fetching Data
data = pd.read_csv('https://query.data.world/s/y6agr3tybufuvkmq7zvlmat27ehuz6', sep=',', encoding='windows-1252', engine='python')
data_df = pd.DataFrame(data);
    
# Preprocessing Data (Removing invalid Y Data) - Phase 1 
if 'existence.confidence' in data_df.columns:
  data_df.pop('existence.confidence');


# 1. Converting the {Y, Yes} = 1, {N, No} = 2
yes_index = data_df[(data_df['existence'] == 'Y') | (data_df['existence'] == 'Yes') ].index
no_index = data_df[(data_df['existence'] == 'N') | (data_df['existence'] == 'No') ].index

data_df.loc[yes_index,'existence_int'] = 1
data_df.loc[no_index,'existence_int'] = 0

# 2. Remove NA & N/A Existence
data_df = data_df.dropna()

for i, v in data_df.iterrows():
  # Lowering case
  tweet = v['tweet'].lower()
  
  final_text = []
  # Tokenize 
  stop_words = set(stopwords.words('english'))
  tokenize = nltk.tokenize.word_tokenize(tweet)
  for word in tokenize:
    if word not in stopwords.words('english') and word.isalpha():
      final_text.append(lemmatizer.lemmatize(word))
  # Since panda doesn't recommending store the array data, we should join 
  # the data. 
  # Also we should remove the http/link keywords in last position
  final_text.pop(len(final_text)-1)
  data_df.loc[i, 'final_words'] = " ".join(final_text)
  
X = data_df['final_words']
Y = data_df['existence_int']

Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X, Y,test_size=0.3)

Tfidf_vect = TfidfVectorizer(max_features=10000)
c = Tfidf_vect.fit_transform(data_df['final_words'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

print(Test_X)


1840      book geoengineering change climate conversation
2939    india seek clarity equity climate change flow ...
2547    mrsjojoxx science disproves global warming ala...
2172    make case climate change release glossy report...
3694    climate change invasive specie http via climat...
                              ...                        
3374    rt josiedc due snow course rt senate global wa...
5642    nasa climate change facebook become fan keep c...
3537    rt davidcorndc someone please explain conserva...
5826     evidence climate change cause earthquake volcano
2116    plan b california brace climate change califor...
Name: final_words, Length: 1268, dtype: object


In [58]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
print(predictions_NB)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

[1. 1. 1. ... 1. 1. 1.]
Naive Bayes Accuracy Score ->  78.15457413249212


In [59]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  82.09779179810725


In [101]:
Manual_Test = pd.DataFrame(['rt'])
Manual_Test_Tdidf = Tfidf_vect.transform(Manual_Test[0])

SVM.predict(Manual_Test_Tdidf)

array([1.])

In [None]:
terms = Tfidf_vect.get_feature_names()

# sum tfidf frequency of each term through documents
sums = c.toarray().sum(axis=0)

data = []
for col, term in enumerate(terms):
    data.append((term, sums[col]))

ranking = pd.DataFrame(data, columns=['term','rank'])

# Now the documents of the datasets mostly talked about
# Global Warming.
print(ranking.sort_values(by='rank', ascending=False))

          term        rank
2324    global  232.347570
976    climate  227.263256
5800   warming  218.307623
871     change  216.914151
2638      http  135.799525
...        ...         ...
2640     huang    0.270175
6002   xiaoyan    0.270175
3077       kun    0.270175
2950  jingstri    0.260227
3712     nolte    0.253574

[6051 rows x 2 columns]
