<a href="https://colab.research.google.com/github/blackwakhu/Currency_exchange-java/blob/master/nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import the Libraries

In [None]:
import numpy as np
import pandas as pd

In [None]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, precision_score, recall_score

# Shared NLP helpers: the lemmatizer used during preprocessing and the
# TF-IDF vectoriser used to turn the cleaned text into features.
lemma = WordNetLemmatizer()
bag_of_words = TfidfVectorizer(stop_words="english", use_idf=True)

# Candidate classifiers, keyed by their repr so each result can be labelled
# when it is printed.
models = {
    str(estimator): estimator
    for estimator in (
        GaussianNB(),
        DecisionTreeClassifier(),
        MLPClassifier(max_iter=350),
        LogisticRegression(),
        SGDClassifier(),
        KNeighborsClassifier(),
        RidgeClassifier(),
    )
}

In [None]:
# Fetch the NLTK resources this notebook relies on: WordNet for the
# lemmatizer, the stop-word lists, and the Punkt models for word_tokenize.
# "punkt_tab" is the resource recent NLTK releases load in place of the
# pickled "punkt" models; without it word_tokenize raises LookupError there.
# Requesting both keeps the notebook working across NLTK versions.
for resource in ("wordnet", "stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Loading the data

In [None]:
# Unpack the zipped CSV next to the archive, then load it into a DataFrame.
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# no mount call is visible in this notebook; confirm before Run All.
from zipfile import ZipFile
with ZipFile("/content/drive/MyDrive/Twitter_Data.csv.zip", "r") as zipobj:
  # Extract only the single CSV member into the Drive folder.
  zipobj.extract("Twitter_Data.csv",path="/content/drive/MyDrive")

df = pd.read_csv("/content/drive/MyDrive/Twitter_Data.csv")

# Data Analysis

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [None]:
# Column dtypes and non-null counts, to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [None]:
# Distinct values present in the label column.
np.unique(df['category'])

array([-1.,  0.,  1., nan])

In [None]:
# Turn literal "nan" strings into real missing values, then drop every row
# that has a missing field.  dropna() returns a new frame instead of
# modifying df, so the result must be assigned back to take effect.
df = df.replace("nan", np.nan).dropna()
df

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


# Data Preprocessing

In [None]:
def remove_urls(text):
    """Return ``text`` with http(s):// and www. URLs removed.

    The input is passed through ``str()`` first, so non-string values are
    accepted and come back as strings.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', str(text))

def remove_html(text):
    """Return ``text`` with every ``<...>`` tag removed (non-greedy match)."""
    return re.sub(r'<.*?>', r'', text)

def lower(text):
    """Return ``text`` converted to lower case."""
    return text.lower()
def remove_num(text):
    """Return ``str(text)`` with every run of digits removed."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', str(text))

def punct_remove(text):
    """Return ``str(text)`` keeping only word characters, digits and whitespace."""
    not_word_or_space = re.compile(r"[^\w\s\d]")
    return not_word_or_space.sub("", str(text))
# Stop-word sets already built, keyed by language, so the set is created once
# instead of once per row.
_STOPWORDS_CACHE = {}


def remove_stopwords(text):
    """Return ``str(text)`` with English stop words removed.

    The text is split on whitespace, tokens found in NLTK's English stop-word
    list are dropped, and the remaining tokens are joined with single spaces.
    The comparison is case-sensitive, so the text should be lower-cased first.
    """
    if "english" not in _STOPWORDS_CACHE:
        # Built lazily so defining the function does not touch the corpus.
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    english_stopwords = _STOPWORDS_CACHE["english"]
    return " ".join(
        word for word in str(text).split() if word not in english_stopwords
    )
def remove_mention(x):
    """Return ``x`` with every ``@name`` mention (``@`` plus word characters) removed."""
    return re.sub(r'@\w+', '', x)
def remove_space(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def remove_hash(x):
    """Return ``x`` with every ``#tag`` hashtag (``#`` plus word characters) removed."""
    return re.sub(r'#\w+', '', x)




In [None]:
# Cleaning steps, applied in order to every value of df['clean_text'].
# remove_mention and remove_hash run BEFORE punct_remove: punct_remove strips
# the '@' and '#' characters, so if it ran first the mention/hashtag patterns
# could never match and the handle/tag words would stay in the text.
cleaning_steps = [
    remove_urls,       # first step; also coerces every value to str
    remove_html,
    lower,             # stop-word matching below is case-sensitive
    remove_mention,
    remove_hash,
    remove_num,
    punct_remove,
    remove_stopwords,
    remove_space,      # last, to tidy the gaps left by the removals above
]

df['text_clean'] = df['clean_text']
for step in cleaning_steps:
    df['text_clean'] = df['text_clean'].apply(step)

In [None]:
def lemmatize_data(text):
  """Tokenise ``text`` and return its lemmatised tokens joined by single spaces."""
  lemmas = (lemma.lemmatize(token) for token in word_tokenize(text))
  return ' '.join(lemmas)

# Lemmatise every cleaned tweet into a new column.
df['text_lemma'] = df['text_clean'].apply(lemmatize_data)

In [None]:
# Preview the frame now that text_clean and text_lemma have been added.
df.head()

Unnamed: 0,clean_text,category,text_clean,text_lemma
0,when modi promised “minimum government maximum...,-1.0,modi promised minimum government maximum gover...,modi promised minimum government maximum gover...
1,talk all the nonsense and continue all the dra...,0.0,talk nonsense continue drama vote modi,talk nonsense continue drama vote modi
2,what did just say vote for modi welcome bjp t...,1.0,say vote modi welcome bjp told rahul main camp...,say vote modi welcome bjp told rahul main camp...
3,asking his supporters prefix chowkidar their n...,1.0,asking supporters prefix chowkidar names modi ...,asking supporter prefix chowkidar name modi gr...
4,answer who among these the most powerful world...,1.0,answer among powerful world leader today trump...,answer among powerful world leader today trump...


In [None]:
# Discard rows with any missing value, then separate the lemmatised text
# (features) from the category labels (target).
df = df.dropna()
X, y = df['text_lemma'], df['category']

In [None]:
# Vectorise the lemmatised text with TF-IDF, then reduce the resulting matrix
# to 10 components with truncated SVD.
# NOTE(review): no random_state is passed to TruncatedSVD, so the components
# may not be exactly repeatable between runs — confirm whether a seed is wanted.
xtfi = bag_of_words.fit_transform(X)
svd = TruncatedSVD(n_components=10, algorithm="arpack")
x_svd = svd.fit_transform(xtfi)

# Model Selection

In [None]:
# Hold out 35% of the samples for evaluation; the fixed seed keeps the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_svd,
    y,
    test_size=0.35,
    random_state=0,
)

In [None]:
# Shape of the training feature matrix: (samples, SVD components).
x_train.shape

(105929, 10)

# Data training

In [None]:
# Fit every candidate on the training split and report its test-set metrics.
# Precision, recall and F1 are macro-averaged (unweighted mean over classes).
# With average='micro' on a single-label multiclass problem all three reduce
# to plain accuracy, so they added nothing beyond the accuracy line.
# Note: macro-averaged recall equals the balanced accuracy score.
for name, estimator in models.items():
    print("model: {}".format(name))
    y_pred = estimator.fit(x_train, y_train).predict(x_test)
    print("accuracy score: {}".format(accuracy_score(y_test, y_pred)))
    print("f1_score: {}".format(f1_score(y_test, y_pred, average='macro')))
    print("balanced accuracy score: {}".format(balanced_accuracy_score(y_test, y_pred)))
    print("precision score: {}".format(precision_score(y_test, y_pred, average='macro')))
    print("recall score: {}".format(recall_score(y_test, y_pred, average='macro')))
    print()
    print()

model: GaussianNB()
accuracy score: 0.3406556802244039
f1_score: 0.3406556802244039
balanced accuracy score: 0.37997980908398915
precision score: 0.3406556802244039
recall score: 0.3406556802244039


model: DecisionTreeClassifier()
accuracy score: 0.46953015427769984
f1_score: 0.46953015427769984
balanced accuracy score: 0.4431847847760379
precision score: 0.46953015427769984
recall score: 0.46953015427769984


model: MLPClassifier(max_iter=350)
accuracy score: 0.5479312762973352
f1_score: 0.5479312762973352
balanced accuracy score: 0.46499390532071777
precision score: 0.5479312762973352
recall score: 0.5479312762973352


model: LogisticRegression()
accuracy score: 0.49328541374474055
f1_score: 0.49328541374474055
balanced accuracy score: 0.40484953984275734
precision score: 0.49328541374474055
recall score: 0.49328541374474055


model: SGDClassifier()
accuracy score: 0.4934256661991585
f1_score: 0.4934256661991585
balanced accuracy score: 0.4006087233872761
precision score: 0.49342566

The model with the highest accuracy score was the neural network (`MLPClassifier`). Next, test the effect of increasing its `max_iter` setting.