In [5]:
!pip install spacy



In [6]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [7]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [8]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [9]:
document = open('gdrive/My Drive/Colab Notebooks/text.txt').read()
document = nlp(document)
dir(document)
print(document[0])

Hi


In [10]:
list(document.sents)

[Hi Friends, I am Yokesh, completed Bachelors degree in Engineering with 8.46 CGPA.,
 Currently I am doing internship in TCS RIO-125 with the project Automate detection and recognition of Grammatical Errors.]

In [11]:
all_tags = {w.pos: w.pos_ for w in document}

for word in list(document.sents)[0]:
  print(word, word.tag_)

Hi UH
Friends NNS
, ,
I PRP
am VBP
Yokesh NNP
, ,
completed VBD
Bachelors NNPS
degree NN
in IN
Engineering NNP
with IN
8.46 CD
CGPA NNP
. .


In [12]:
[chunk.text for chunk in document.noun_chunks]

['I',
 'Yokesh',
 ', completed Bachelors degree',
 'Engineering',
 '8.46 CGPA',
 'I',
 'internship',
 'TCS RIO-125',
 'the project Automate detection',
 'recognition',
 'Grammatical Errors']

In [13]:
from spacy import displacy
displacy.render(document, style='dep')
displacy.render(document, style='ent')

'<div class="entities" style="line-height: 2.5; direction: ltr">Hi Friends, I am Yokesh, completed Bachelors degree in \n<mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Engineering\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">GPE</span>\n</mark>\n with \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    8.46\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">CARDINAL</span>\n</mark>\n CGPA. Currently I am doing internship in \n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    TCS\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; bo

In [14]:
#define some parameters
noisy_pos_tags = ["PROP"]
min_token_length = 2

#Function to check if the token is a noise or not
def isNoise(token):
  is_noise = False
  if token.pos_ in noisy_pos_tags:
    is_noise = True
  elif token.is_stop == True:
    is_noise = True
  elif len(str(token)) <= min_token_length:
    is_noise = True
  return is_noise
def cleanup(token, lower = True):
  if lower:
    token = token.lower()
  return token.strip()

#top unigrams used in the reviews
from collections import Counter
cleaned_list = [cleanup(str(word)) for word in document if not isNoise(word)]
Counter(cleaned_list) .most_common(5)

# Entity Detection
print("\nEntity Detection...\n")
label = set([w.label_ for w in document.ents])
for i in label:
  entities = [cleanup(str(e), lower=False) for e in document.ents if i==e.label_]
  entities = list(set(entities))
  print(i, entities)

#Displaying tokens
for token in document:
  print (token.text, token.tag_, token.head.text, token.dep_)
#  Displaying Noun Phrases
print("\nDisplaying Noun Phrases")
for np in document.noun_chunks:
  print(np.text, np.root.dep_, np.root.head.text)




Entity Detection...

CARDINAL ['8.46']
ORG ['Grammatical Errors', 'TCS']
GPE ['Automate', 'Engineering']
Hi UH am intj
Friends NNS Hi npadvmod
, , am punct
I PRP am nsubj
am VBP am ROOT
Yokesh NNP am attr
, , degree punct
completed VBD degree amod
Bachelors NNPS degree compound
degree NN am attr
in IN degree prep
Engineering NNP in pobj
with IN degree prep
8.46 CD CGPA nummod
CGPA NNP with pobj
. . am punct
Currently RB doing advmod
I PRP doing nsubj
am VBP doing aux
doing VBG doing ROOT
internship NN doing dobj
in IN internship prep
TCS NNP RIO-125 compound
RIO-125 NNP in pobj
with IN doing prep
the DT detection det
project NN detection compound
Automate NNP detection compound
detection NN with pobj
and CC detection cc
recognition NN detection conj
of IN detection prep
Grammatical JJ Errors amod
Errors NNS of pobj
. . doing punct

Displaying Noun Phrases
I nsubj am
Yokesh attr am
, completed Bachelors degree attr am
Engineering pobj in
8.46 CGPA pobj with
I nsubj doing
internship dob

In [15]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import string
punctuations = string.punctuation
!python3 -m spacy download en
spacy.load('en_core_web_sm')
from spacy.lang.en import English
parser = English()

#Custom transformer using spacy
class predictors(TransformerMixin):
  def transform(self, X, **transform_params):
    return [clean_text(text) for text in X]
  def fit(self, X, y=None, **fit_params):
    return self
  def get_params(self, deep=True):
    return {}

#Basic Utility function to clean the text

def clean_text(text):
  return text.strip().lower()

def spacy_tokenizer(sentence):
  tokens = nlp(sentence)
  tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
  tokens = [tok for tok in tokens if (tok not in sklearn_stop_words and tok not in punctuations)]
  return tokens

#create vectorizer object to generate feature vectors, we will use custom spacy tokenizer
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
classifier = LinearSVC()

#create the pipeline to clean, tokenize, vectorize and classify
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])




[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import FunctionTransformer


In [17]:
base_classifiers = [
    ('knn', KNeighborsClassifier()),
    ('tree', DecisionTreeClassifier()),
    ('nb' , GaussianNB()),
   ('KNC' , KNeighborsClassifier()),
    ('xgb' , GradientBoostingClassifier()),
    ('svc',LinearSVC())
]
from sklearn.ensemble import VotingClassifier
voting_classifier = VotingClassifier(base_classifiers)

pipe2 = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('dense_transformer', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
                 ('classifier',voting_classifier)
])



In [18]:
import pandas as pd
import numpy as np

df_train = pd.read_csv("gdrive/My Drive/Colab Notebooks/All_train_data.csv")

In [19]:
df_train.head()

Unnamed: 0,input,labels
0,I am reading score of Mahler is Symphony No . .,0
1,I am not interested in cars or electric applia...,1
2,This is my homework for my English class .,0
3,"In comparison , Canada is catches increased an...",0
4,"Fortunately , my older sister is friend is a d...",1


In [20]:
df_train.loc[df_train['labels']==0, 'labels']='neg'

In [21]:
df_train.loc[df_train['labels']==1, 'labels']='pos'

In [22]:
train=[(df_train.iloc[i][0],df_train.iloc[i][1]) for i in range(len(df_train))]

In [23]:
train

[('I am reading score of Mahler is Symphony No . .', 'neg'),
 ('I am not interested in cars or electric appliances .', 'pos'),
 ('This is my homework for my English class .', 'neg'),
 ('In comparison , Canada is catches increased and decreased at steadier rate , although similarly to US .',
  'neg'),
 ('Fortunately , my older sister is friend is a dentist .', 'pos'),
 ('Also , some recycling methods for plastics use more oil to use machinery than making new one from oil .',
  'pos'),
 ('Comparison Contrast', 'pos'),
 ('See you .', 'neg'),
 ('Talking with Kyouke from about to .', 'pos'),
 ('biodiversity n .', 'neg'),
 ('I understand it is a dream .', 'pos'),
 ('I request many things to her so often .', 'neg'),
 ('Working - Holiday is a system for youngmen to taravel abroad with working in some country .',
  'neg'),
 ('Nice to meet you .', 'neg'),
 ('Is not that easier than the Taiwanese one ?', 'pos'),
 ('I could not write diary for long time .', 'neg'),
 ('And my ankle and knee are in 

In [24]:
df_test = pd.read_csv("gdrive/My Drive/Colab Notebooks/val_data.csv")


In [25]:
df_test.loc[df_test['labels']==0, 'labels']='neg'

In [26]:
df_test.loc[df_test['labels']==1, 'labels']='pos'

In [27]:
test = [(df_test.iloc[i][0],df_test.iloc[i][1]) for i in range(len(df_test))]

In [28]:
test

[('It was the same thing .', 'neg'),
 ('I can study idioms a lot ', 'pos'),
 ('I just bet some coins for numbers or colors .', 'neg'),
 ('Yesterday , I was checking some e - mails on business matters with my own laptop at home since I was commanded to wait at home by my boss .',
  'pos'),
 ('I regret that I have not played the piano very much for the last weeks .',
  'pos'),
 ('I will try to write a diary everyday to record things happened in my daily life and share it with net friend here .',
  'neg'),
 ('A TV performer said on a TV program before that when he wore a T - shirt which was a heavy metal band is on his trip to Australia , he was surrounded by men who looked very strong .',
  'pos'),
 ('She is Five years old .', 'neg'),
 ('A silicon steamer , which a friend of my ex - colleagues gave me as a wedding gift , will help me to cook vegetables .',
  'pos'),
 ('I do not imagine whether my life is long or short .', 'pos'),
 ('What a not grad it is !', 'neg'),
 ('Now that I had bre

In [29]:
pipe2.fit([x[0] for x in train], [x[1] for x in train])
pred_data = pipe2.predict([x[0] for x in test])
for (sample, pred) in zip(test, pred_data):
  print(sample, pred )
print("Accuracy:", accuracy_score([x[1] for x in test], pred_data))



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
('so i want you to check my draft , ', 'neg') neg
('Driving Japanese car , I feel it trustworthy .', 'neg') neg
(' - Vacation Most of the learners may have a long vacation within three months and it can be an obstacle from continuing to learn .', 'neg') pos
('Good morning .', 'pos') neg
('You are very good at writing English .', 'neg') pos
('Over the sugar completed !', 'pos') neg
('Recently Japanese prefers to eat light sweet jam .', 'neg') pos
('Find a partner to talk in English .', 'pos') neg
('So , I will be happy if you become my friend .', 'neg') pos
('next year .', 'pos') pos
('We filled the remaining time by going to a museum to appreciate art .', 'neg') pos
('when you will talk to children or friends who are not need to be young', 'neg') pos
('They came my house before I married .', 'neg') pos
('He said that I have a stomachache .', 'pos') pos
('We talked about what we are doing these days and grumbled about our 

In [30]:
print("Accuracy:", accuracy_score([x[1] for x in test], pred_data))

Accuracy: 0.6299
