In [10]:
import numpy as np
import pandas as pd
import joblib

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize
import re
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

import joblib

In [11]:
# Load the tab-separated essay file (ISO-8859-1 encoded). Malformed rows are
# reported with a warning and skipped rather than aborting the read.
# `on_bad_lines='warn'` is the replacement for `error_bad_lines=False`
# (deprecated in pandas 1.3, removed in pandas 2.0).
df = pd.read_csv('training_set_7_final.txt', sep='\t', on_bad_lines='warn', encoding="ISO-8859-1")

In [12]:
def text_to_cleantext_tokenizer(text, remove_nonletters=False, remove_stopwords=False, stemming=False, lemma=False):
    """Turn raw (possibly HTML) text into a space-joined string of lower-case tokens.

    Processing order: strip HTML markup, replace runs of non-ASCII characters
    with a space, optionally replace every non-letter with a space, lower-case,
    tokenize with ``word_tokenize``, then optionally drop English stop words,
    Porter-stem and WordNet-lemmatize the tokens.

    Parameters
    ----------
    text
        Markup/text handed straight to ``BeautifulSoup(text, 'lxml')``.
    remove_nonletters : bool
        If true, every character outside ``a-z``/``A-Z`` becomes a space
        (digits and punctuation are therefore dropped before tokenizing).
    remove_stopwords : bool
        If true, tokens found in ``stopwords.words("english")`` are removed.
    stemming : bool
        If true, each token is passed through ``PorterStemmer.stem``.
    lemma : bool
        If true, each token is passed through ``WordNetLemmatizer.lemmatize``
        (called without a part-of-speech argument). Applied after stemming
        when both flags are set.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # 1. Remove HTML markup, keeping only the text content
    plain_text = BeautifulSoup(text, 'lxml').get_text()

    # 2. Replace each run of non-ASCII characters with a single space
    cleaned = re.sub(r"[^\x00-\x7F]+", " ", plain_text)

    # 3. Optionally blank out everything that is not an ASCII letter
    if remove_nonletters:
        cleaned = re.sub("[^a-zA-Z]", " ", cleaned)

    # 4. Convert to lower-case and split into word tokens
    words = word_tokenize(cleaned.lower())

    # 5. Remove stop words (held in a set for fast membership tests)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # 6. Stemming
    if stemming:
        porter = PorterStemmer()
        words = [porter.stem(w) for w in words]

    # 7. Lemmatization
    if lemma:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(w) for w in words]

    # 8. Join words back into one string by space, and return the result
    return " ".join(words)
    

In [13]:
# Peek at the first raw essay (row label 0) before any cleaning.
df.loc[0, 'essay']

'Patience is when your waiting .I was patience when in line waiting for lunch .I didn\x92t c ut any one to eat .I was standing and waiting for my turn .Patience ,some people don\x92t have it .Lots of people just cut or yell at you because they don\x92t have  any patience. Sometimes people will push you out of their way .They only do that because they don\x92t have patience at all. Patience is what people need .People need patience because lots o f feelings get hurt .Everyone should have patience.'

In [14]:
# Clean every essay in one pass and write the result back to the 'essay' column.
# Non-ASCII characters are dropped *before* tokenizing (encode with 'ignore')
# so that e.g. "didn\x92t" collapses to "didnt" instead of being split by the
# tokenizer's own non-ASCII -> space substitution. The bytes are decoded back
# to str so the tokenizer receives text rather than a bytes object.
df['essay'] = df['essay'].apply(
    lambda essay: text_to_cleantext_tokenizer(
        essay.encode('ascii', 'ignore').decode('ascii'),
        remove_nonletters=True,
        remove_stopwords=False,
        lemma=True,
    )
)

In [15]:
# Peek at the first essay (row label 0) again, now after cleaning.
df.loc[0, 'essay']

'patience is when your waiting i wa patience when in line waiting for lunch i didnt c ut any one to eat i wa standing and waiting for my turn patience some people dont have it lot of people just cut or yell at you because they dont have any patience sometimes people will push you out of their way they only do that because they dont have patience at all patience is what people need people need patience because lot o f feeling get hurt everyone should have patience'

In [16]:
# Build unigram + bigram count features over the cleaned essays, keeping only
# n-grams that occur in at least 3 essays (min_df=3).
# NOTE(review): the recorded vocabulary output further down starts with
# 'ability', 'able', 'able get', which looks like stop words were removed in
# the run that produced it, yet no stop_words argument is set here and the
# cleaning cell passes remove_stopwords=False. Confirm the intended setting
# and re-run from a fresh kernel.
vectorizer = CountVectorizer(min_df=3, ngram_range=(1,2))
X = vectorizer.fit_transform(df['essay'])

In [17]:
# Number of distinct n-gram features kept by the vectorizer.
vectorizer.get_feature_names_out().shape[0]

14974

In [21]:
# List the n-gram features (column labels of the count matrix) learned by `vectorizer`.
vectorizer.get_feature_names_out()

array(['ability', 'able', 'able get', ..., 'zoo', 'zoom', 'zoomed'],
      dtype=object)

In [22]:
# Convert the count matrix to a dense array for inspection; this materialises
# every (essay, n-gram) cell, so it can be large.
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [23]:
# Cosine similarity between the n-gram count vectors of the essays at row positions 625 and 880.
cosine_similarity(X[625],X[880])

array([[0.31184505]])

In [24]:
# Cosine similarity between the n-gram count vectors of the essays at row positions 59 and 1064.
cosine_similarity(X[59],X[1064])

array([[0.05605519]])

In [25]:
# Cosine similarity between the n-gram count vectors of the essays at row positions 756 and 270.
cosine_similarity(X[756],X[270])

array([[0.74117431]])

In [26]:
# Cosine similarity between the n-gram count vectors of the essays at row positions 262 and 466.
cosine_similarity(X[262],X[466])

array([[0.]])

In [27]:
# Persist the dense count matrix to disk with joblib (the target path has no file extension).
# NOTE(review): the file name says "no_stopword", but the cleaning cell passes
# remove_stopwords=False and the CountVectorizer is built without stop_words —
# confirm which configuration this artifact is meant to represent.
joblib.dump(X.toarray(), 'essay_ngram_no_stopword_asap7')

['essay_ngram_no_stopword_asap7']

In [35]:
# Toy example: fit a unigram + bigram CountVectorizer on a single sentence to
# see how n-grams are extracted and indexed.
# NOTE: this rebinds `X`, so the essay count matrix built earlier is no longer
# reachable under that name once this cell has run.
vect = CountVectorizer(ngram_range=(1,2))
X = vect.fit_transform(['Patience is important'])

In [36]:
# Show the repr (shape, dtype, storage format) of the matrix currently bound to `X`.
X

<1x5 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [37]:
# Mapping from each extracted n-gram to its column index in the count matrix.
vect.vocabulary_

{'patience': 3, 'is': 1, 'important': 0, 'patience is': 4, 'is important': 2}