## Importing Necessary Libraries

In [0]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import nltk
import nltk as nk
from nltk import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import PunktSentenceTokenizer,word_tokenize, sent_tokenize
from nltk import FreqDist
from nltk.corpus import stopwords
import re

### Installing NLTK

In [3]:
 ! pip install -U nltk

 nltk.download('punkt')
 nltk.download("popular")

Requirement already up-to-date: nltk in /usr/local/lib/python3.6/dist-packages (3.4.5)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    | 

True

### Loading the saved data as plain text

### Choosing file from local disk

In [4]:
from google.colab import files
uploaded = files.upload()


Saving data.csv to data.csv


### Downloading the Dataset

#### Dataset for Detailed Patent Litigation

In [0]:
#! git clone https://www.kaggle.com/uspto/patent-litigations#cases.csv

#### Dataset for citation

In [0]:
#! git clone http://aminer.org/lab-datasets/citation/DBLP_citation_2014_May.zip

#### Dataset for Twitter

In [0]:
#! git clone https://www.kaggle.com/c/twitter-sentiment-analysis2/data

### Reading csv files

In [5]:
def readCSVFile():
    # Reading training and test files to list data structures
    data = pd.read_csv("data.csv", error_bad_lines=False, sep = "\t", index_col=False, encoding='latin-1',low_memory=False)
    df = pd.DataFrame(data)
    return df
sentiments = readCSVFile()
x = sentiments["SentimentText"].str.replace('http\S+|www.\S+', '', case=False)
text = x.str.replace('[^a-zA-Z0-9-_*.]', ' ')
data = (', '.join(text))
data



## Data preperation - Feature Extraction

In [0]:
lemmatizer = WordNetLemmatizer() # For word lemmatization
stemmer = PorterStemmer() # For word Stemming
REPLACE_BY_SPACE = re.compile('[/(){}\[\]\|@,;]') 
BAD_SYMBOLS = re.compile('[^0-9a-z #+_]')
REMOVING_NUMBERS = re.compile("(^|\W)\d+")
STOPWORDS = set(stopwords.words('english'))

def Nltk2Word_And_Tag(nltk_tag):
    """
        Input text: a string
        return: string tag such as 'a','v','n','r'
    """
    if nltk_tag.startswith('J'):
        return wordnet.wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.wordnet.ADV
    else:        
        return None
    
def Lemmatize_Sentence(sentence):
    """
        Input text: a string 
        return: lemmatized string
    """
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    w_n_tagged = map(lambda x: (x[0], Nltk2Word_And_Tag(x[1])), nltk_tagged)
    res_words = []
    for word, tag in w_n_tagged:
        if tag is None: 
            res_words.append(word)
        else:
            res_words.append(lemmatizer.lemmatize(word, tag))
    return " ".join(res_words)

def Stem_Sentence(sentence):
    """
        Input text: a string
        Description: Can be applied for varying the analysis of Classifier
        return: stemmed string
    """
    tokenize_words = nltk.word_tokenize(sentence)
    stem_sents = [stemmer.stem(word) for word in tokenize_words]
    return " ".join(stem_sents)

def Length_Words_Disapproved(sentence,length):
    """
        Input text: a string and int for defining the limit on the length of the words that will be allowed
        return: modified text string
    """
    tokenize_words = nltk.word_tokenize(sentence)
    sent = list(filter(lambda x: len(x) > length,tokenize_words))
    return " ".join(sent) 

def Clean_Text(text,flag):
    """
        Input text: a string and flag for stemming on the current text
        return: modified text string which is lower-cased
    """
    text = text.lower() # lowercase text
    text = REMOVING_NUMBERS.sub(" ", text) # removes the occurences of number such as 2019 or 3valued or 21
    text = BAD_SYMBOLS.sub(" ", text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = REPLACE_BY_SPACE.sub(" ", text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text 
    if(flag):
        text = Stem_Sentence(text)
    
    return Lemmatize_Sentence(Length_Words_Disapproved(text,3))

#part of speech tagging of data
def posTagging(x):
    tokenization = word_tokenize(x)
    tagged = nltk.pos_tag(tokenization)
    return tagged

#chunking of data
taggedData = posTagging(data)
def chunkingData(x):
    chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
    chunkParser = nltk.RegexpParser(chunkGram)
    chunked = chunkParser.parse(x)
    print (chunked)
    chunked.draw()

#Named Entity Recognition of data
def nerData(x):
    namedEnt = nltk.ne_chunk (x, binary = True)
    print (namedEnt)

##Checking the results


In [7]:
data = Clean_Text(data,1)
data

'friend miss moon trailer alreadi omgaga sooo gunna dentist sinc supos crown think cheat worri much juuuuuuuuuuuuuuuuussssst chillin sunni work tomorrow tonight hand uniform today miss alreadi hmmmm wonder number must think posit thank hater face weekend suck isnt show australia that feel right awhh complet useless funni twitter feel strang fine listen semison celebr huge roll thunder scari beard grow well year start shaunamanu happi meantim iran wompppp wompp caus follow pretti awesom level write massiv blog tweet myspac comp shut lose fetal posit head hospitol pull golf tourni place think someth yeah bore what wrong pleas tell bother wish could spend rest life serious feeel like shit right realli want sleep nooo hour danc assign finish goodby exam hello alcohol tonight realiz deep geez give girl warn atleast hate athlet appear tear live televis miss think wear skinni jean cute sweater heel realli sure today meet meat horsi move saturday morn need work week realli dont like room bore 

In [8]:
Lemmatize_Sentence(data)

'friend miss moon trailer alreadi omgaga sooo gunna dentist sinc supos crown think cheat worri much juuuuuuuuuuuuuuuuussssst chillin sunni work tomorrow tonight hand uniform today miss alreadi hmmmm wonder number must think posit thank hater face weekend suck isnt show australia that feel right awhh complet useless funni twitter feel strang fine listen semison celebr huge roll thunder scari beard grow well year start shaunamanu happi meantim iran wompppp wompp caus follow pretti awesom level write massiv blog tweet myspac comp shut lose fetal posit head hospitol pull golf tourni place think someth yeah bore what wrong pleas tell bother wish could spend rest life serious feeel like shit right realli want sleep nooo hour danc assign finish goodby exam hello alcohol tonight realiz deep geez give girl warn atleast hate athlet appear tear live televis miss think wear skinni jean cute sweater heel realli sure today meet meat horsi move saturday morn need work week realli dont like room bore 

In [9]:
posTagging(data)

[('friend', 'NN'),
 ('miss', 'VBZ'),
 ('moon', 'RB'),
 ('trailer', 'JJ'),
 ('alreadi', 'NN'),
 ('omgaga', 'NN'),
 ('sooo', 'NN'),
 ('gunna', 'NN'),
 ('dentist', 'NN'),
 ('sinc', 'NN'),
 ('supos', 'VBD'),
 ('crown', 'JJ'),
 ('think', 'NN'),
 ('cheat', 'NN'),
 ('worri', 'VBP'),
 ('much', 'JJ'),
 ('juuuuuuuuuuuuuuuuussssst', 'NN'),
 ('chillin', 'NN'),
 ('sunni', 'NN'),
 ('work', 'NN'),
 ('tomorrow', 'NN'),
 ('tonight', 'JJ'),
 ('hand', 'NN'),
 ('uniform', 'NN'),
 ('today', 'NN'),
 ('miss', 'VBP'),
 ('alreadi', 'NN'),
 ('hmmmm', 'NN'),
 ('wonder', 'VBP'),
 ('number', 'NN'),
 ('must', 'MD'),
 ('think', 'VB'),
 ('posit', 'NN'),
 ('thank', 'NN'),
 ('hater', 'NN'),
 ('face', 'NN'),
 ('weekend', 'NN'),
 ('suck', 'VBD'),
 ('isnt', 'JJ'),
 ('show', 'NN'),
 ('australia', 'VBZ'),
 ('that', 'IN'),
 ('feel', 'VB'),
 ('right', 'JJ'),
 ('awhh', 'NN'),
 ('complet', 'NN'),
 ('useless', 'JJ'),
 ('funni', 'NN'),
 ('twitter', 'NN'),
 ('feel', 'NN'),
 ('strang', 'VBD'),
 ('fine', 'JJ'),
 ('listen', 'JJ'),
 (

In [10]:
chunkingData(taggedData)

(S
  is/VBZ
  so/RB
  sad/JJ
  for/IN
  my/PRP$
  (Chunk APL/NNP friend/NN)
  .../:
  .../:
  .../:
  .../:
  ./.
  ,/,
  I/PRP
  missed/VBD
  the/DT
  (Chunk New/NNP Moon/NNP trailer/NN)
  .../:
  ,/,
  omg/VB
  its/PRP$
  already/RB
  7/CD
  30/CD
  (Chunk O/NNP)
  ,/,
  (Chunk ../NNP Omgaga/NNP)
  ./.
  (Chunk Im/NNP)
  sooo/VBD
  im/JJ
  gunna/NN
  (Chunk CRy/NNP)
  ./.
  I/PRP
  ve/VBP
  been/VBN
  at/IN
  this/DT
  dentist/NN
  since/IN
  11../CD
  I/PRP
  was/VBD
  suposed/VBN
  2/CD
  just/RB
  get/VB
  a/DT
  crown/NN
  put/VBD
  on/IN
  30mins/CD
  .../:
  ,/,
  i/JJ
  think/VBP
  mi/JJ
  bf/NN
  is/VBZ
  cheating/VBG
  on/IN
  me/PRP
  (Chunk T_T/NNP)
  ,/,
  or/CC
  i/VB
  just/RB
  worry/VB
  too/RB
  much/JJ
  ,/,
  (Chunk Juuuuuuuuuuuuuuuuussssst/NNP Chillin/NNP)
  ,/,
  (Chunk Sunny/NNP Again/NNP Work/NNP Tomorrow/NNP)
  -/:
  TV/NN
  (Chunk Tonight/NNP)
  ,/,
  handed/VBN
  in/IN
  my/PRP$
  uniform/JJ
  today/NN
  ./.
  i/VB
  miss/VBP
  you/PRP
  already/RB
  ,/,
  h

TclError: ignored

In [11]:
nerData(taggedData)

(S
  is/VBZ
  so/RB
  sad/JJ
  for/IN
  my/PRP$
  APL/NNP
  friend/NN
  .../:
  .../:
  .../:
  .../:
  ./.
  ,/,
  I/PRP
  missed/VBD
  the/DT
  (NE New/NNP Moon/NNP)
  trailer/NN
  .../:
  ,/,
  omg/VB
  its/PRP$
  already/RB
  7/CD
  30/CD
  O/NNP
  ,/,
  ../NNP
  Omgaga/NNP
  ./.
  Im/NNP
  sooo/VBD
  im/JJ
  gunna/NN
  CRy/NNP
  ./.
  I/PRP
  ve/VBP
  been/VBN
  at/IN
  this/DT
  dentist/NN
  since/IN
  11../CD
  I/PRP
  was/VBD
  suposed/VBN
  2/CD
  just/RB
  get/VB
  a/DT
  crown/NN
  put/VBD
  on/IN
  30mins/CD
  .../:
  ,/,
  i/JJ
  think/VBP
  mi/JJ
  bf/NN
  is/VBZ
  cheating/VBG
  on/IN
  me/PRP
  T_T/NNP
  ,/,
  or/CC
  i/VB
  just/RB
  worry/VB
  too/RB
  much/JJ
  ,/,
  (NE Juuuuuuuuuuuuuuuuussssst/NNP Chillin/NNP)
  ,/,
  (NE Sunny/NNP Again/NNP Work/NNP Tomorrow/NNP)
  -/:
  TV/NN
  Tonight/NNP
  ,/,
  handed/VBN
  in/IN
  my/PRP$
  uniform/JJ
  today/NN
  ./.
  i/VB
  miss/VBP
  you/PRP
  already/RB
  ,/,
  hmmmm/NN
  .../:
  ./.
  i/VB
  wonder/VBP
  how/WRB
  she/P