In [79]:
!pip install pyspellchecker
!pip install pyspark
!pip install findspark
!pip install tqdm

!pip install plotly
!pip install cufflinks
import plotly.graph_objs as go





In [76]:
#### for data manipulation and math operations ####
import pandas as pd
import numpy as np

#### for visualizations ####
# plotly
from plotly.offline import iplot
import plotly.graph_objs as go
from plotly.subplots import make_subplots

#### NLP packages ####
# NLTK library
from nltk.corpus import stopwords
# SKLearn 
from sklearn.feature_extraction.text import CountVectorizer
# py-spell checker
from spellchecker import SpellChecker


#### other useful packages ####
import string
from collections import Counter
import re
from tqdm import tqdm


#### Pyspark packages ####
import findspark
# findspark.init()
import pyspark as ps
import warnings
from pyspark.sql import SQLContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator



In [4]:

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [18]:

colnames=['sentiment', 'ids', 'date', 'flag','user','text'] 
train = pd.read_csv("training.1600000.processed.noemoticon.csv", header=None, names=colnames) 

train.head()

Unnamed: 0,sentiment,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [19]:
print(f"Shape of training data: {train.shape}")


Shape of training data: (1600000, 6)


In [23]:
train = train[['text','sentiment']]
train.head()


Unnamed: 0,text,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [24]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   text       1600000 non-null  object
 1   sentiment  1600000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [25]:
# Lets use a subset of the data for faster processing
# Lets use about 100K rows of data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train['text'], train['sentiment'], test_size=0.85, random_state=42)
train = pd.concat([X_train,y_train],axis=1)

In [30]:
len(train)

240000

In [33]:
X_test.head()

541200               @chrishasboobs AHHH I HOPE YOUR OK!!! 
750       @misstoriblack cool , i have no tweet apps  fo...
766711    @TiannaChaos i know  just family drama. its la...
285055    School email won't open  and I have geography ...
705995                               upper airways problem 
Name: text, dtype: object

In [46]:
# lets run a groupby query which has similar functionalities to dealing with RDBMS
class_group_counts = train.groupby('sentiment').count()['text'].reset_index().sort_values(by='text',ascending=False)
class_group_counts.style.background_gradient(cmap='Reds')

Unnamed: 0,sentiment,text
0,0,120144
1,4,119856


In [53]:
# create the trace
trace = go.Pie(
    labels = class_group_counts.sentiment,
    values = class_group_counts.text
)

data = [trace]
layout = go.Layout(title="Pie plot of the distribution of the categorical classes")

fig = go.Figure(data = data,layout=layout)
fig.show()

In [69]:
# lot of work has to be done on cleaning the data.
#cleaning data include 4 function (
#remove_HTML ,
#Removing URLs, 
#remove_emojis,
#Removing Punctuations)


#Removing HTML-tags
def remove_HTML(text):
    """
    Inputs a string and outputs a string free of any HTML tags
    """
    tag = re.compile(r'<.*?>')
    
    return tag.sub(r'',text)





In [70]:
#Removing URLs



def remove_URL(text):
    """
    Inputs a string and outputs a string free of any URLs
    """
    url = re.compile(r'https?://\S+|www\.\S+')
    
    return url.sub(r'',text)

In [56]:
#Removing Emojis

def remove_emojis(text):
    """
    Inputs a string and outputs a string free of any emojis
    """
    emoji = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE)
    
    return emoji.sub(r'',text)

In [99]:
#Removing Punctuations 
def remove_punctuations(text):
    """
    Inputs a string and outputs a string free of any punctuations
    """
    punct = re.compile(r'[^\w\s]')
    
    return punct.sub(r'',text)

In [101]:
# now assemble all 4  functions to return a cleaned text

def clean_text(text):
    """
    inputs a string:
    -------------------------------------
    outputs a string free from 
    1) html-tags
    2) urls
    3) emojis
    4)#Removing Punctuations 
    and lastly corrects the misspelled words
    """
    text = remove_HTML(text)
    text = remove_URL(text)
    text = remove_emojis(text)
    text = remove_punctuations(text)
 
    
    return text

In [102]:
#apply dataclean on ourdata train 


corpus = []
for i in tqdm(range(len(train[:10]))):
    text = train.iloc[i]['text']
    corpus.append(clean_text(text))

100%|██████████| 10/10 [00:00<00:00, 5014.71it/s]


In [107]:
corpus

['rhiannarenagade amberbanana my hair smelled like fucking piss when i showered last night ',
 'I dont feel like eating anything ',
 'TWalk Make faces at Aaron for me ',
 'maciejlessthan3 i agree  its not sunny or warm or as joyous as i anticipated',
 'Apparently I do not have enough to do because I just got stuck doing another report that is quottoo large to printquot per by boss ',
 'aplusk sometimes they even take advantage of thatso hard to have people involved in ur life someytimes ',
 'Part 2 of Kathryn and Nates wedding weekend  the wedding Im honored to be a part of their special day  Congrats',
 'Oh so hot ',
 'Wossybookclub    im up for this ',
 'item84jeremy  thanks jeremy']

In [90]:
sc = ps.SparkContext('local[4]')
sqlContext = SQLContext(sc)
print("Just created a SparkContext")


Just created a SparkContext



Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.



In [109]:
cleant = pd.DataFrame(corpus)

In [111]:
cleant.head()

Unnamed: 0,0
0,rhiannarenagade amberbanana my hair smelled li...
1,I dont feel like eating anything
2,TWalk Make faces at Aaron for me
3,maciejlessthan3 i agree its not sunny or warm...
4,Apparently I do not have enough to do because ...


In [128]:
from pyspark.sql import SQLContext

df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('cleaned_train.csv')
type(df)

pyspark.sql.dataframe.DataFrame

In [131]:
df.show(10)


+--------------------+------+
|               tweet|target|
+--------------------+------+
|dived many times ...|     0|
|  not the whole crew|     0|
|nope they did not...|     0|
|spring break in p...|     0|
|could not bear to...|     0|
|would ve been the...|     0|
|ahh ive always wa...|     0|
|was out most of t...|     0|
|baked you cake bu...|     0|
|blagh class at to...|     0|
+--------------------+------+
only showing top 10 rows



In [132]:
# Lets split the data for training and testing the model
(train_set, val_set, test_set) = df.randomSplit([0.98, 0.01, 0.01], seed = 2000)