## Importing Necessary Libraries

In [0]:
import spacy
from spacy import displacy
import pandas as pd
import re
import json   
from pandas.io.json import json_normalize  

In [0]:
nlp = spacy.load('en')

## Loading the saving data as plain text

### Choosing file from local disk

In [3]:
from google.colab import files
uploaded = files.upload()


Saving data.csv to data.csv


### Downloading the Dataset

#### Downloding dataset for citation

In [0]:
#! git clone http://aminer.org/lab-datasets/citation/DBLP_citation_2014_May.zip

#### Downloding dataset for Patent letigation

In [0]:
#! git clone https://www.kaggle.com/uspto/patent-litigations#cases.csv

#### Downloding dataset for Twitter

In [0]:
#! git clone https://www.kaggle.com/c/twitter-sentiment-analysis2/data

### Reading csv files

In [4]:
def readCSVFile():
    # Reading training and test files to list data structures
    data = pd.read_csv("data.csv", error_bad_lines=False, sep = "\t", index_col=False, encoding='latin-1',low_memory=False)
    df = pd.DataFrame(data)
    return df
sentiments = readCSVFile()
#select column from which you want to get value
x = sentiments["SentimentText"].str.replace('http\S+|www.\S+', '', case=False)
x = x.str.replace('[^a-zA-Z0-9-_*.]', ' ')
data = (''.join(x))
data



##Data cleaning and Feature extraction

In [0]:
def Clean_Text(text):
    """
        Input text: a string and flag for stemming on the current text
        return: modified text string which is lower-cased
    """
   
    text = re.sub(r'[(^|\W)\d+]', ' ', text) # removes the occurences of number such as 2019 or 3valued or 21
    #text = re.sub(r'[^0-9a-z#+_]', ' ', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = re.sub(r'[/(){}\[\]\|@,;]', ' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = ' '.join(word for word in text.split())
    return text

#tokenization of data
def tokenizeData(x):
    for token in x:
        print (token.lemma_)

#part of speech tagging of data
def posTagging(x):
    for token in x:
        print (token.text, token.pos_)


#chunking of data
def chunkingData(x):
    for token in x.noun_chunks:
        print (token.root.text)

#Named Entity Recognition of data
def nerData(x):
    for token in x.ents:
        print (token.text, token.label_)


In [6]:
data = Clean_Text(str(data))
data = nlp(data)
data



##Displaying results

In [7]:
tokenizeData(data)

be
so
sad
for
-PRON-
APL
friend
-PRON-
miss
the
New
Moon
trailer
omg
-PRON-
already
o
Omgaga
-PRON-
be
sooo
-PRON-
be
gunna
CRy
-PRON-
ve
be
at
this
dentist
since
-PRON-
be
supose
just
get
a
crown
put
on
min
i
think
mi
bf
be
cheat
on
-PRON-
T_T
or
i
just
worry
too
much
Juuuuuuuuuuuuuuuuussssst
Chillin
Sunny
Again
Work
tomorrow
TV
tonight
hand
in
-PRON-
uniform
today
i
miss
-PRON-
already
hmmmm
i
wonder
how
-PRON-
-PRON-
number
-PRON-
must
think
about
positive
thank
to
all
the
hater
up
in
-PRON-
face
all
day
this
weekend
have
suck
so
far
jb
be
not
show
in
australia
any
more
ok
that
s
-PRON-
-PRON-
win
lt
this
be
the
way
i
feel
right
now
awhhe
man
-PRON-
m
completely
useless
rt
now
funny
all
-PRON-
can
do
be
twitter
feel
strangely
fine
now
-PRON-
m
go
to
go
listen
to
some
Semisonic
to
celebrate
huge
roll
of
thunder
just
now
so
scary
-PRON-
just
cut
-PRON-
beard
off
-PRON-
s
only
be
grow
for
well
over
a
year
-PRON-
m
go
to
start
-PRON-
over
shaunamanu
be
happy
in
the
meantime
very
sad
abo

In [8]:
posTagging(data)

is VERB
so ADV
sad ADJ
for ADP
my DET
APL PROPN
friend NOUN
I PRON
missed VERB
the DET
New PROPN
Moon PROPN
trailer NOUN
omg NOUN
its DET
already ADV
O INTJ
Omgaga PROPN
I PRON
m VERB
sooo NOUN
i PRON
m VERB
gunna NOUN
CRy NOUN
I PRON
ve VERB
been VERB
at ADP
this DET
dentist NOUN
since ADP
I PRON
was VERB
suposed VERB
just ADV
get VERB
a DET
crown NOUN
put VERB
on ADP
mins NOUN
i PRON
think VERB
mi NOUN
bf NOUN
is VERB
cheating VERB
on ADP
me PRON
T_T PROPN
or CCONJ
i PRON
just ADV
worry VERB
too ADV
much ADJ
Juuuuuuuuuuuuuuuuussssst PROPN
Chillin PROPN
Sunny PROPN
Again PROPN
Work PROPN
Tomorrow NOUN
TV PROPN
Tonight NOUN
handed VERB
in PART
my DET
uniform NOUN
today NOUN
i PRON
miss VERB
you PRON
already ADV
hmmmm VERB
i PRON
wonder VERB
how ADV
she PRON
my DET
number NOUN
I PRON
must VERB
think VERB
about ADP
positive ADJ
thanks NOUN
to ADP
all DET
the DET
haters NOUN
up PART
in ADP
my DET
face NOUN
all DET
day NOUN
this DET
weekend NOUN
has VERB
sucked VERB
so ADV
far ADV
jb INTJ


In [9]:
chunkingData(data)

friend
I
omg
Omgaga
I
i
I
dentist
I
crown
mins
i
bf
me
T_T
i
Tonight
uniform
i
you
i
she
I
thanks
haters
face
weekend
australia
it
you
way
i
man
I
I
I
Semisonic
roll
thunder
I
beard
It
year
I
it
shaunamanu
Iran
wompp
You
who
one
me
you
you
level
I
tweet
Myspace
comp
it
lays
position
Hospitol
Tourny
rd
I
I
something
what
him
me
i
i
rest
life
I
I
hours
dancing
assignment
exams
HELLO
I
it
Geez
girl
atleast
I
it
athlete
ACL
television
i
guys
i
i
jeans
sweater
heels
what
you
Meat
horsie
morning
Sat
days
Room
Sick
Wardrobe
I
Walk
Floyd
relievers
times
pictures
type
spaz
virus
brother
who
MSN
Wiit
Babes
i
something
i
call
someone
office
I
it
I
I
CORNELL
class
what
ginaaa
SHOW
Spiral_galaxy
YMPtweet
it
me
i
reality
Low
motivation
rest
week
entertainment
someone
you
he
melody
Lakers
bathroom
i
congrats
David
end
tonsils
I
Katie
friends
me
love
mom
hug
Harry
Hand
it
leysh
I
what
I
I
i
sooooon
i
haven
t
him
foreverr
I
allergies
I
hair
I
poll
I
guys
it
I
Earl
I
Jersey
I
hour
SYTYCD
I
it
I
fix
I
I


In [10]:
nerData(data)

Juuuuuuuuuuuuuuuuussssst Chillin PERSON
Sunny Again Work Tomorrow TV WORK_OF_ART
Tonight TIME
today DATE
this weekend DATE
Feeling GPE
Semisonic NORP
well over a year DATE
Iran GPE
Sad PERSON
Myspace GPE
Headed NORP
Hospitol PERSON
the Golf Tourny ORG
Feeeling PERSON
I have hours TIME
ACL ORG
today DATE
Meet your Meat WORK_OF_ART
Saturday morning TIME
days DATE
like a million CARDINAL
MSN ORG
last week DATE
CHRIS CORNELL PRODUCT
GO ORG
Muslims NORP
the week DATE
another year DATE
Lakers ORG
David PERSON
five days end of July DATE
Katie PERSON
Sunday DATE
Hand PERSON
tomorrow DATE
Earl PERSON
New Jersey GPE
the first hour of SYTYCD TIME
second ORDINAL
Manuel my Basil PERSON
Miley PERSON
Australia GPE
this morning TIME
Northern Ireland GPE
today DATE
The Beach FAC
DanMerriweather PERSON
Math PERSON
the Human Rights Watch World Report ORG
Jin PERSON
Jus Got PERSON
Much Love Grandpa WORK_OF_ART
Goodbye PERSON
Zahra PERSON
Longest night TIME
sunday DATE
Rats PERSON
tomorrow DATE
two CARDINA