<a href="https://colab.research.google.com/github/benza613/CS583-Research-Project/blob/main/DataPre_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so that files stored there are reachable from this runtime.
from google.colab import drive # import drive from google colab

ROOT = "/content/gdrive"     # mount point handed to drive.mount below
print(ROOT)                 # echo the mount point (optional)

drive.mount(ROOT) 

In [None]:
%cd gdrive/MyDrive/DMTM/

In [None]:
# List the contents of the current working directory.
%ls

In [None]:
!pip install tweet-preprocessor

In [None]:
import functools
import re
import string

import nltk
import pandas as pd
import preprocessor as p
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the 'Obama' sheet: columns B-F, no header row, first two rows skipped,
# with explicit column names.
DATAFILE = 'training-Obama-Romney-tweets.xlsx'
df_obama = pd.read_excel(
    DATAFILE,
    sheet_name='Obama',
    header=None,
    index_col=None,
    usecols='B:F',
    skiprows=[0, 1],
    names=['date', 'time', 'tweet', 'class', 'result'],
)
print(df_obama)

In [None]:
# Explore the data: column count, dtype of every column, and how many
# columns share each dtype.
print(f'Number of variables - {df_obama.shape[1]}\n')
print(f'Data Types for each variable - \n{df_obama.dtypes}\n')
print(f'Number of variables for each data type - \n{df_obama.dtypes.value_counts()}')

In [None]:
# Bar chart of how many rows carry each value of 'class' (before filtering).
sns.countplot(data=df_obama, x='class')

In [None]:
# Keep only rows whose class label is 0, 1 or -1.
# Labels are compared as strings, so normalise the column to str first.
df_obama['class'] = df_obama['class'].astype(str)
# .copy() makes the filtered result an independent frame; later cells assign
# new columns to it, which would otherwise raise SettingWithCopyWarning.
df_obama = df_obama[df_obama['class'].isin(['0', '1', '-1'])].copy()
#df_romney = df_romney[df_romney['class'].isin(['0', '1', '-1'])] 

# Print the shape of the dataframe 
print(df_obama.shape) 
sns.countplot(x = 'class', data = df_obama)

In [None]:
# Coerce every tweet to a Python string, then summarise the frame.
df_obama['tweet'] = df_obama['tweet'].map(str)
df_obama.info()

In [None]:
# Character count of each tweet (measured on its string form).
df_obama['tweet_length'] = df_obama['tweet'].map(lambda text: len(str(text)))

In [None]:
# Preview the first ten rows, now including 'tweet_length'.
df_obama.head(10)

In [None]:
# Basic cleaning: run every tweet through tweet-preprocessor's clean().
df_obama['tweet'] = df_obama['tweet'].apply(p.clean)
df_obama.head(10)

In [None]:
# Strip HTML tags: anything from '<' up to the next '>' is removed.
df_obama['tweet'] = df_obama['tweet'].apply(
    lambda text: re.sub('<[^>]+>', '', text)
)
df_obama.head(10)

In [None]:
# Remove punctuation characters, then remove runs of ASCII digits.
df_obama['tweet'] = df_obama['tweet'].apply(
    lambda text: re.sub(
        '[0-9]+',
        '',
        text.translate(str.maketrans('', '', string.punctuation)),
    )
)
df_obama.head(10)

In [None]:
# #Tokenization via a regex split on non-word characters (disabled)
# df_obama['tweet_token'] = df_obama['tweet'].apply(lambda x: re.split('\W+', x))
# df_obama.head(10)

In [None]:
# Download the NLTK resources used by the cleaning helpers below: the
# stop-word lists and WordNet (needed by WordNetLemmatizer).
# The commented-out lines are an earlier, disabled stop-word filter.
# #remove stop words
#stopword = nltk.corpus.stopwords.words('english')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
# df_obama['tweet_token'] = df_obama['tweet'].apply(lambda x: [word for word in x if word not in stopword])
# df_obama.head(10)

In [None]:
@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loaded only once."""
    return frozenset(nltk.corpus.stopwords.words('english'))


def clean_stopwords(tweet, stop_words=None):
    """Split a tweet into tokens and drop placeholders, non-alphabetic tokens and stop words.

    Args:
        tweet: The tweet text; it is split on whitespace.
        stop_words: Optional container of lower-case words to discard. When
            omitted, NLTK's English stop-word list is used (loaded once and
            cached, instead of being re-read for every token).

    Returns:
        A list of the surviving tokens in their original order and casing.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    # Matches tokens made only of word characters that are not digits
    # (letters and underscores).
    alphabetic = re.compile(r'[^\W\d]*$')

    return [
        token
        for token in tweet.split()
        if token != 'user'                      # literal 'user' tokens are dropped
        and alphabetic.match(token)
        and token.lower() not in stop_words     # stop-word test is case-insensitive
    ]

In [None]:
def normalization(tweet_list):
    """Lemmatize every token of ``tweet_list`` as a verb.

    Args:
        tweet_list: Iterable of word tokens.

    Returns:
        A new list holding the WordNet verb lemma of each token, in order.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, 'v') for token in tweet_list]

In [None]:
# Tokenise each tweet without stop words, then lemmatise the remaining tokens.
df_obama['tweet_clean'] = (
    df_obama['tweet']
    .apply(clean_stopwords)
    .apply(normalization)
)
df_obama.head(10)

In [None]:
def clean_text(text, stop_words=None, stemmer=None):
    """Lower-case, strip punctuation and digits, tokenise, drop stop words and stem.

    Args:
        text: Raw text to clean.
        stop_words: Optional container of words to discard. Defaults to NLTK's
            English stop-word list. Pass a prebuilt set when calling this for
            many rows, so the list is not reloaded on each call.
        stemmer: Optional object exposing ``stem(word)``. Defaults to NLTK's
            PorterStemmer (the original code referenced an undefined ``ps``;
            a Porter stemmer is assumed to be what was intended).

    Returns:
        A list of stemmed tokens; empty tokens produced by the split are dropped.
    """
    if stop_words is None:
        stop_words = set(nltk.corpus.stopwords.words('english'))
    if stemmer is None:
        stemmer = nltk.stem.PorterStemmer()

    # Remove punctuation character by character and lower-case what remains.
    lowered = "".join(ch.lower() for ch in text if ch not in string.punctuation)
    without_digits = re.sub('[0-9]+', '', lowered)
    # Splitting on non-word runs leaves '' at the edges when the text starts or
    # ends with whitespace; those are filtered out below.
    tokens = re.split(r'\W+', without_digits)
    return [stemmer.stem(tok) for tok in tokens if tok and tok not in stop_words]

In [None]:
# Display the cleaned token lists.
df_obama['tweet_clean']

**Training and testing data split**

In [None]:
# Summarise columns, non-null counts and dtypes before splitting.
df_obama.info()

In [None]:
# Preview the frame without the auxiliary columns. The result is not assigned,
# so df_obama itself is unchanged (later cells still read 'tweet_clean').
df_obama.drop(columns=['date', 'time','result','tweet_length','tweet_clean'])

In [None]:
# 80/20 split of the whole frame, with labels taken from the 'class' column.
# NOTE(review): the feature side still contains the 'class' column itself.
CLX_train, CLX_val, CLY_train, CLY_val = train_test_split(
    df_obama.iloc[:],
    df_obama['class'],
    test_size=0.2,
    random_state=0,
)

In [None]:
# Display the cleaned token lists that landed in the training split.
CLX_train['tweet_clean']

In [None]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the 'tweet' column, converted to a dense array.
# NOTE(review): the vectorizer is fitted on every row, i.e. before the
# train/test split performed further down.
tfidfconverter = TfidfVectorizer(
    max_features=20000,
    min_df=5,
    max_df=0.7,
    stop_words=stopwords.words('english'),
)
X = tfidfconverter.fit_transform(df_obama['tweet']).toarray()

In [None]:
# Target vector: the (string-typed) class label of each tweet.
y=df_obama['class']

In [None]:

# Show the vocabulary terms selected by the fitted vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
print(tfidfconverter.get_feature_names_out())

In [None]:
# (rows, columns) of the TF-IDF feature matrix.
X.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the TF-IDF rows for testing; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees with a fixed seed, trained on the training split.
text_classifier = RandomForestClassifier(
    n_estimators=200,
    random_state=0,
)
text_classifier.fit(X_train, y_train)


In [None]:
# Predict class labels for the held-out test rows.
predictions = text_classifier.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Report, in order: confusion matrix, per-class report, overall accuracy.
print(confusion_matrix(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))
print(accuracy_score(y_true=y_test, y_pred=predictions))

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
tokenizer = Tokenizer(num_words= MAX_VOCAB_SIZE, filters='#$%&()*+<=>@[\\]^_`{|}~\t\n', lower=True)
tokenizer.fit_on_texts(df_obama['class'].values + ' DELIM '+ df_obama['tweet'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
X = tokenizer.texts_to_sequences(df_obama['class'].values + ' DELIM '+ df_obama['tweet'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

In [None]:
Y = pd.get_dummies(df_obama['class']).values
print('Shape of label tensor:', Y.shape)

In [None]:
# The maximum number of words to be used. (most frequent)
MAX_VOCAB_SIZE = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 200

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout

model = Sequential()
model.add(Embedding(MAX_VOCAB_SIZE, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(LSTM(200, dropout = 0.2))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy',f1_m,precision_m, recall_m])

# run for small number of epochs then save 
epochs = 3

history = model.fit(X, Y, epochs=epochs)