In [None]:
import os
import tweepy as tw
import pandas as pd
import csv
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re


In [None]:
# Output file for the scraped tweets, opened in append mode so repeated runs
# accumulate rows instead of overwriting earlier ones.
# - newline='' is what the csv module requires of the file object it writes to;
#   without it, row terminators can be translated twice on some platforms.
# - an explicit UTF-8 encoding keeps Arabic text writable regardless of the
#   platform's default locale encoding.
csvFile = open('Tweets.csv', 'a', newline='', encoding='utf-8')
csvWriter = csv.writer(csvFile)


In [None]:
# Scrape up to 300 Arabic tweets matching HashValue and append one
# (created_at, full_text) row per tweet through csvWriter.
# NOTE(review): `api`, `HashValue` and `date_since` are not defined in any
# visible cell -- they have to exist in the kernel before this cell runs.
tweet_cursor = tw.Cursor(api.search,
                         q=HashValue,
                         count=20,
                         lang="ar",
                         since=date_since,
                         tweet_mode='extended')

for tweet in tweet_cursor.items(300):
    print (tweet.created_at, tweet.full_text)
    # Write the text itself. Passing `.encode(...)` bytes to the csv writer
    # makes Python 3 store the literal "b'\xd8...'" repr instead of the tweet.
    # (csvFile therefore has to be open with an encoding that can represent
    # Arabic text, e.g. UTF-8.)
    csvWriter.writerow([tweet.created_at, tweet.full_text])

# Push buffered rows to disk so the message below is true even though the
# handle stays open for later appends.
csvFile.flush()

# Report the file that was actually written (the handle's own name), not a
# name derived from the hashtag.
print ("Scraping finished and saved to " + csvFile.name)

In [None]:
# Collecting data: a second pull of up to 300 Arabic tweets, keeping the text
# together with the author's screen name and profile location.

tweets = tw.Cursor(api.search, q=HashValue, lang="ar", since=date_since).items(300)

users_locs = []
for tweet in tweets:
    users_locs.append([tweet.text, tweet.user.screen_name, tweet.user.location])

tweet_text = pd.DataFrame(users_locs, columns=['tweet', 'user', "location"])

# Lift the display limits so the frame below is rendered in full.
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Persist the raw pull next to the notebook.
tweet_text.to_csv('1.csv')

tweet_text  # unlabelled dataset


In [None]:
# Labelled dataset: read the annotated tweets and keep only the necessary
# columns (the tweet text and its sentiment label).
data = pd.read_csv('Riyadh_Season.csv').loc[:, ['tweet', 'Category']]
data


In [None]:
# Cleaning data

def clean_text(text):
    """Normalise a raw tweet into plain Arabic text ready for tokenisation.

    Steps, in order:
      1. strip Arabic diacritics (tashkeel);
      2. squeeze any character repeated 3+ times down to 2;
      3. drop punctuation/symbols, Latin letters and digits, and collapse
         whitespace;
      4. reduce doubled waw / ya / alef to a single letter;
      5. apply the search -> replace table (alef, ta-marbuta and alef-maqsura
         normalisation, tatweel and underscore removal, attaching a standalone
         "و" / "يا" to the following word);
      6. collapse whitespace again and strip both ends.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas produces
        for an empty CSV field) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text without leading/trailing whitespace, or "" when
        nothing is left, so callers can filter on ``cleaned == ""``.
    """
    # Missing values arrive as float NaN / None; re.sub would raise on them.
    if not isinstance(text, str):
        return ""

    # Parallel lists: search[i] is replaced by replace[i], in this order.
    # Most punctuation entries are already gone by the time the table is
    # applied (the [^\w\s] pass below removes them); they are kept so the
    # table stays the single record of the intended normalisation.
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى",
              "\\",'\n', '\t','&quot;','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا",
               "","","","ي","",' ', ' ',' ',' ? ',' ؟ ', ' ! ']

    # Remove Shadda, Fatha, Tanwin, Kasra, ...
    text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', "", text)

    # Remove elongation: a run of the same character is cut down to two.
    text = re.sub(r'(.)\1+', r"\1\1", text)

    text = re.sub(r"[^\w\s]", '', text)    # punctuation and symbols
    text = re.sub(r"[a-zA-Z]", '', text)   # Latin letters
    text = re.sub(r"\d+", ' ', text)       # digits become a separator
    text = re.sub(r"\n+", ' ', text)
    text = re.sub(r"\t+", ' ', text)
    text = re.sub(r"\r+", ' ', text)
    text = re.sub(r"\s+", ' ', text)

    # The elongation pass leaves at most two in a row; these letters go to one.
    for doubled, single in (('وو', 'و'), ('يي', 'ي'), ('اا', 'ا')):
        text = text.replace(doubled, single)

    for old, new in zip(search, replace):
        text = text.replace(old, new)

    # The table can introduce new spaces (e.g. "_" -> " "), and digit-only or
    # symbol-only tweets end up as a lone space: normalise so such results
    # really are "".
    return re.sub(r"\s+", ' ', text).strip()

In [None]:


# Clean every tweet, then drop the rows whose cleaned text is the empty string.
data['cleaned_text'] = data['tweet'].map(clean_text)

has_text = data['cleaned_text'].ne("")
data = data.loc[has_text]
data.head(10)


In [None]:
# Number of samples in each category (positive = 1, negative = -1, neutral = 0).
# The dataset is unbalanced.
data.groupby('Category').count()

In [None]:
# Dealing with the imbalance: cut every class down to the size of the
# smallest one.

min_sample = data.groupby('Category')['cleaned_text'].count().min()

# First `min_sample` rows of each class, stacked in the order 1, -1, 0.
b_data = pd.concat(
    [data[data.Category == label].head(min_sample) for label in (1, -1, 0)]
)

b_data.groupby('Category').count()

In [None]:
# Model input and target taken from the cleaned frame.
# NOTE(review): both are read from `data`, not from the balanced `b_data`
# built earlier -- confirm whether the balanced subset was meant to be used.
X = data['cleaned_text'].values  # input

Y = data['Category'].values.astype('float32')  # target


In [None]:
# Tokenization process: turn each cleaned tweet into a fixed-length sequence
# of word indices.

maxlen = 300        # sequence length handed to pad_sequences
max_fatures = 800   # vocabulary cap handed to the Tokenizer (num_words)

texts = data['cleaned_text'].values

tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, padding='post', maxlen=maxlen)
X


In [None]:
seed = 20  # random_state for the train/test split

In [None]:
# Hold out 20% of the samples for testing; the split is fixed by `seed`.
split = train_test_split(X, Y, test_size=0.2, random_state=seed)
X_train, X_test, Y_train, Y_test = split

# Sanity check: features and targets must have matching lengths per split.
for caption, features, targets in (("Training:", X_train, Y_train),
                                   ("Testing: ", X_test, Y_test)):
    print(caption, len(features), len(targets))

In [None]:
from keras import layers
from keras import backend as K
from keras.backend import clear_session

# Hyper-parameters
embedding_dim = 200   # size of each learned word vector
dropout = 0.5         # shared by the LSTM and the dense head
opt = 'adam'


def rounded_accuracy(y_true, y_pred):
    """Share of samples whose prediction, rounded to the nearest integer, equals the label.

    The labels are -1 / 0 / 1 and the network emits a single value in
    [-1, 1], so rounding the output gives the predicted class.
    """
    # Flatten both sides so a (batch,) target and a (batch, 1) output are
    # compared element-wise instead of being broadcast against each other.
    labels = K.cast(K.flatten(y_true), K.floatx())
    predicted = K.round(K.flatten(y_pred))
    return K.mean(K.cast(K.equal(labels, predicted), K.floatx()))


# Drop any graph left over from a previous run of this cell.
clear_session()

model = Sequential()
model.add(layers.Embedding(input_dim=max_fatures,
                           output_dim=embedding_dim,
                           input_length=maxlen))
# No return_sequences here: the LSTM must emit ONE vector per tweet. Returning
# the full sequence made the network produce an output per timestep, which
# does not match the single label each tweet has.
model.add(layers.Bidirectional(layers.LSTM(50, dropout=dropout,
                                           recurrent_dropout=dropout)))

model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(dropout))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(dropout))
# tanh spans (-1, 1), so all three labels (-1, 0, 1) are reachable; the
# previous ReLU output could never go below 0.
model.add(layers.Dense(1, activation='tanh'))

# Binary cross-entropy assumes targets in [0, 1]; with a -1 label it is not a
# meaningful loss. Mean squared error fits the single-score formulation.
model.compile(optimizer=opt,
              loss='mse',
              metrics=[rounded_accuracy])
model.summary()

In [None]:
# Training: fit for 10 epochs in batches of 64, scoring the held-out split
# after every epoch.

history = model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, Y_test),
    verbose=True,
)

# Score on the data the model was fitted on.
train_scores = model.evaluate(X_train, Y_train, verbose=True)
loss, accuracy = train_scores
print("Training Accuracy: {:.4f}".format(accuracy))

# Testing: score on the held-out split.
test_scores = model.evaluate(X_test, Y_test, verbose=True)
loss_val, accuracy_val = test_scores
print("Testing Accuracy:  {:.4f}".format(accuracy_val))

In [None]:
# Compare results: line up the true label, the predicted label and the text
# of every blind-test sample.
# NOTE(review): `Y_blind`, `pred_blind` and `blind_test` are not created in
# any visible cell -- they have to exist in the kernel before this cell runs.

df_blind = pd.DataFrame({'REAL': Y_blind,
                         # flatten the predictions to one value per sample
                         'PRED': pred_blind.reshape(pred_blind.shape[0],),
                         'TEXT': blind_test.cleaned_text})
df_blind = df_blind.reset_index()[['REAL', 'PRED', 'TEXT']]

# Round the raw model output to the nearest label before comparing.
df_blind['PRED'] = df_blind['PRED'].round()

error_records = df_blind[df_blind.REAL != df_blind.PRED]
print("Number of misclassified reviews: {} out of {}".format(error_records.shape[0], df_blind.shape[0]))

# `accuracy_score` is never imported in this notebook; the share of rows whose
# prediction matches the label is the same quantity and needs only pandas.
blind_accuracy = (df_blind.REAL == df_blind.PRED).mean()
print("Blind Test Accuracy:  {:.4f}".format(blind_accuracy))

In [None]:
# Sample outputs: three randomly chosen rows of the comparison table.

df_blind.sample(3)