In [15]:
import numpy as np
import pandas as pd
import joblib

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
import re
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

import joblib

In [22]:
# Load the paraphrased essays. The separator is two characters (a space
# followed by a tab); pandas interprets multi-character separators as regular
# expressions, which only the Python parsing engine supports. Naming the engine
# explicitly keeps the parse identical while avoiding the ParserWarning that is
# emitted when pandas silently falls back from the C engine.
df = pd.read_csv('asap7_paraphrased.txt', sep=" \t", engine='python')

  return func(*args, **kwargs)


In [23]:
df

Unnamed: 0,essay
0,Tolerance is the point at which your pausing ....
1,"I'm not a tolerance individual, similar to I c..."
2,One day I was at b-ball practice and I was run...
3,I going to expound on when I went to the @ORGA...
4,It very well may be exceptionally difficult fo...
...,...
1564,Once I was getting a cool @CAPS1 game it was s...
1565,A patent individual in my life is my mother. A...
1566,At the point when another person I know showed...
1567,I disdain weddings. I love when individuals ge...


In [24]:
def text_to_cleantext_tokenizer(text, remove_nonletters = False, remove_stopwords=False, stemming=False, lemma=False):
    """Turn raw essay markup into one space-joined string of lower-case tokens.

    Processing order (stages in parentheses run only when their flag is set):
      1. strip HTML with BeautifulSoup (lxml parser) and keep the text;
      2. replace each run of non-ASCII characters with a single space;
      3. (remove_nonletters) replace every character outside a-z / A-Z
         with a space;
      4. lower-case the text and split it with ``word_tokenize``;
      5. (remove_stopwords) drop NLTK's English stop words;
      6. (stemming) apply the Porter stemmer to every token;
      7. (lemma) apply the WordNet lemmatizer to every token.

    Stemming and lemmatization are independent; when both flags are set the
    lemmatizer runs on the already-stemmed tokens.

    Args:
        text: Markup to clean; passed straight to ``BeautifulSoup``.
        remove_nonletters: Blank out everything that is not an ASCII letter.
        remove_stopwords: Remove English stop words after tokenizing.
        stemming: Porter-stem each token.
        lemma: WordNet-lemmatize each token.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Parse as HTML and keep only the visible text.
    plain = BeautifulSoup(text, 'lxml').get_text()

    # Each run of non-ASCII characters collapses to one space.
    cleaned = re.sub(r"[^\x00-\x7F]+", " ", plain)
    if remove_nonletters:
        # Digits, punctuation and whitespace variants all become spaces.
        cleaned = re.sub("[^a-zA-Z]", " ", cleaned)

    tokens = word_tokenize(cleaned.lower())

    if remove_stopwords:
        # A set gives constant-time membership checks while filtering.
        english_stops = set(stopwords.words("english"))
        tokens = [tok for tok in tokens if tok not in english_stops]

    if stemming:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(tok) for tok in tokens]

    if lemma:
        wordnet = WordNetLemmatizer()
        tokens = [wordnet.lemmatize(tok) for tok in tokens]

    return " ".join(tokens)
    

In [25]:
df['essay'][0]

'Tolerance is the point at which your pausing .I was persistence when in line sitting tight for lunch .I didnæt c ut any one to eat .I was standing and sitting tight for my turn .Patience ,a few group donæt have it .Lots of individuals just cut or shout at you since they donæt have any persistence. Once in a while individuals will push you out of their way .They just do that since they donæt have tolerance by any stretch of the imagination. Persistence is the thing that individuals need .People need tolerance since parcels o f emotions get injured .Everyone ought to have tolerance.'

In [26]:
# Clean every essay in one pass instead of iterating rows and writing cells
# back one at a time. Characters outside ASCII are dropped first; the result is
# decoded back to str so the tokenizer receives text rather than bytes (passing
# bytes makes BeautifulSoup guess the encoding before parsing).
df['essay'] = df['essay'].map(
    lambda essay: text_to_cleantext_tokenizer(
        essay.encode('ascii', 'ignore').decode('ascii'),
        remove_nonletters=True,
        remove_stopwords=False,
        lemma=True,
    )
)

In [27]:
df['essay'][0]

'tolerance is the point at which your pausing i wa persistence when in line sitting tight for lunch i didnt c ut any one to eat i wa standing and sitting tight for my turn patience a few group dont have it lot of individual just cut or shout at you since they dont have any persistence once in a while individual will push you out of their way they just do that since they dont have tolerance by any stretch of the imagination persistence is the thing that individual need people need tolerance since parcel o f emotion get injured everyone ought to have tolerance'

In [28]:
# Build bag-of-n-gram count features from the cleaned essays.
# ngram_range=(1,2) extracts unigrams and bigrams; min_df=3 discards terms
# that occur in fewer than 3 essays.
vectorizer = CountVectorizer(min_df=3, ngram_range=(1,2))
X = vectorizer.fit_transform(df['essay'])

In [29]:
len(vectorizer.get_feature_names_out())

16139

In [30]:
vectorizer.get_feature_names_out()

array(['ability', 'ability that', 'ability to', ..., 'zoo', 'zoom',
       'zoomed'], dtype=object)

In [31]:
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [32]:
cosine_similarity(X[625],X[880])

array([[0.45285465]])

In [33]:
cosine_similarity(X[59],X[1064])

array([[0.01365418]])

In [34]:
cosine_similarity(X[756],X[270])

array([[0.58267886]])

In [35]:
cosine_similarity(X[262],X[466])

array([[0.26492824]])

In [36]:
# Persist the essay count matrix to disk with joblib.
# NOTE(review): X.toarray() expands the vectorizer output into a dense array
# before saving; with 16139 features per essay that is mostly zeros. Dumping X
# itself would be smaller, but only switch if whatever loads this file accepts
# a sparse matrix — confirm with the consumer first.
joblib.dump(X.toarray(), 'essay_ngram_paraphrase_asap7')

['essay_ngram_paraphrase_asap7']

In [35]:
# Toy example: fit a separate unigram+bigram vectorizer on a single sentence
# so its vocabulary can be inspected.
# NOTE(review): this rebinds X, replacing the essay count matrix built earlier
# in the notebook. Re-running the cosine_similarity or joblib.dump cells after
# this one would operate on the 1-row toy matrix; consider a distinct name.
vect = CountVectorizer(ngram_range=(1,2))
X = vect.fit_transform(['Patience is important'])

In [36]:
X

<1x5 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [37]:
vect.vocabulary_

{'patience': 3, 'is': 1, 'important': 0, 'patience is': 4, 'is important': 2}