In [3]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Load the feature table; the path is relative to the notebook's working directory.
# No index_col is given, so the frame gets the default 0..n-1 row index.
data = pd.read_csv('../data/data_features.csv')

## Vectorization (TF-IDF vectorization)

In [5]:
# Characters stripped from the essays before vectorization. The apostrophe (')
# and the backtick (`) are not part of this set, so they are left in the text
# (e.g. "don't" is unchanged).
# The backslash before "(" is written as "\\": the former "\(" was an invalid
# escape sequence, which newer Python versions report as a SyntaxWarning. The
# resulting string value is identical.
punctuation = '!"#$%&\\()*+,-./:;<=>?@[\\]^_{|}~'

def remove_punctuation(text):
    """Return ``text`` with every character listed in ``punctuation`` removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without the listed characters; all other characters,
        including whitespace and apostrophes, are kept in their original order.
    """
    # The deletion table is built from the module-level ``punctuation`` at call
    # time, so edits to that string take effect without redefining the function.
    return text.translate(str.maketrans('', '', punctuation))

# Clean every essay element-wise and keep the result in a helper column.
data['full_text_wo_punct'] = data['full_text'].map(remove_punctuation)

In [6]:
# Lower-case the punctuation-free text.
# NOTE(review): TfidfVectorizer lowercases its input by default (lowercase=True),
# so this step looks redundant for the vectorizer below — confirm before removing.
data['full_text_wo_punct'] = data['full_text_wo_punct'].str.lower()

In [14]:
# ngram_range=(2,4): features are word 2-, 3- and 4-grams (no single words).
# max_df=0.8: ignore n-grams that occur in more than 80% of the essays.
# min_df=50: ignore n-grams that occur in fewer than 50 essays.
tfidf_vect = TfidfVectorizer(ngram_range=(2,4), max_df = 0.8, min_df = 50)

# Learn the vocabulary and IDF weights, then encode every essay as a sparse row.
tfidf_matrix = tfidf_vect.fit_transform(data['full_text_wo_punct'])
# Displayed as (number of essays, number of n-gram features).
tfidf_matrix.shape

(3911, 4996)

In [15]:
# The helper column is no longer needed once the TF-IDF matrix has been built.
# Rebinding `data` (instead of inplace=True) together with errors='ignore' lets
# this cell be re-run without raising KeyError when the column is already gone.
data = data.drop(columns=['full_text_wo_punct'], errors='ignore')

In [16]:
# Convert the sparse TF-IDF matrix into a dense DataFrame with one column per
# n-gram. toarray() materializes every zero entry, so memory use grows with
# rows x features (3911 x 4996 in the recorded run).
tf_idf_data = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vect.get_feature_names_out())

In [None]:
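# Disabled alternative to the dense conversion above: builds the same frame but
# keeps the TF-IDF columns sparse.
# tf_idf_data = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=tfidf_vect.get_feature_names_out())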

In [17]:
# Put the original columns and the TF-IDF columns side by side. concat with
# axis=1 aligns rows on the index; both frames are expected to carry the default
# 0..n-1 index here.
# NOTE: `tf_idf_data` is rebound to the combined frame, so running this cell a
# second time without re-running the previous one would add `data`'s columns again.
tf_idf_data = pd.concat([data, tf_idf_data], axis=1)

In [18]:
# Sanity check: the row count should equal the number of essays, and the column
# count should be the columns of `data` plus the number of n-gram features.
tf_idf_data.shape

(3911, 5016)

In [19]:
# Preview the first rows of the combined frame.
tf_idf_data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,excl_quest_sign_count,contractions_count,...,your time,your way,your work,yourself and,yourself in,yourself in world,yourself in world that,yourself is,yourself to,yourself you
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,0,16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,0,17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00299B378633,"Dear, Principal If u change the school policy ...",3.0,3.5,3.0,3.0,3.0,2.5,0,21,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,0,54,...,0.0,0.0,0.0,0.0,0.138985,0.077777,0.081278,0.0,0.0,0.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Persist the combined frame; index=False leaves the row index out of the file.
tf_idf_data.to_csv('../data/tf_idf_data.csv', index=False)

In [None]:
def tf_idf_vectorize(data, text_column='full_text_wo_punct', ngram_range=(1, 3), max_df=0.8, min_df=3):
    """Fit a TF-IDF vectorizer on one text column and append the features to ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the text to vectorize. It is not modified.
    text_column : str, default 'full_text_wo_punct'
        Name of the column in ``data`` that is passed to the vectorizer.
    ngram_range : tuple of (int, int), default (1, 3)
    max_df : float or int, default 0.8
    min_df : float or int, default 3
        Forwarded unchanged to ``TfidfVectorizer``. The defaults are the values
        this function used before they became parameters.

    Returns
    -------
    pandas.DataFrame
        The columns of ``data`` followed by one sparse column per n-gram,
        carrying the same row index as ``data``.
    """
    tfidf_vect = TfidfVectorizer(ngram_range=ngram_range, max_df=max_df, min_df=min_df)
    tfidf_matrix = tfidf_vect.fit_transform(data[text_column])

    # Row i of the matrix belongs to row i of `data`, so the feature frame gets
    # the caller's index. With a fresh 0..n-1 index, the index-based merge below
    # would silently drop or mismatch rows whenever `data` had been filtered,
    # shuffled or split beforehand.
    # NOTE(review): assumes `data.index` has no duplicate labels — confirm.
    features = pd.DataFrame.sparse.from_spmatrix(
        tfidf_matrix,
        index=data.index,
        columns=tfidf_vect.get_feature_names_out(),
    )

    # With unigrams enabled, a feature can share its name with a column of
    # `data` (e.g. a word such as "grammar"). The explicit suffixes keep the
    # original column name untouched and mark the clashing feature instead of
    # letting merge rename the original to "<name>_x".
    return pd.merge(
        data,
        features,
        left_index=True,
        right_index=True,
        suffixes=('', '_tfidf'),
    )

## Normalization (Min Max Scaler)

In [None]:
# Preview the feature frame before scaling.
data.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns to rescale. MinMaxScaler maps each of them to the [0, 1]
# range using that column's own minimum and maximum.
# NOTE(review): the list starts with the six score columns (cohesion ...
# conventions); if these are prediction targets, confirm that scaling them is intended.
columns = ['cohesion', 'syntax', 'vocabulary','phraseology', 
           'grammar', 'conventions', 'excl_quest_sign_count',
           'contractions_count', 'capitalized_mistakes', 
           'word_count', 'sentence_count', 'paragraph_count',
           'avg_word_count_per_paragraph', 'avg_sentence_count_per_paragraph',
           'has_short_paragraphs', 'difficult_word_count',
           'spelling_mistake_count']

scaler = MinMaxScaler()
# fit_transform returns a plain array; it gets its own name so `scaled_data`
# is only ever bound to the final DataFrame.
scaled_values = scaler.fit_transform(data[columns])

# The scaled frame carries data's index so the index-based merge pairs each row
# with its own identifiers even if `data` does not have the default 0..n-1 index.
scaled_data = pd.merge(
    data[['text_id', 'full_text', 'paragraphs']],
    pd.DataFrame(scaled_values, columns=columns, index=data.index),
    left_index=True,
    right_index=True,
)
scaled_data.head()

In [None]:
def min_max_scale(feature_data, columns=None, keep_columns=('text_id', 'full_text', 'paragraphs')):
    """Min-max scale numeric columns and return them next to the identifier columns.

    Parameters
    ----------
    feature_data : pandas.DataFrame
        Frame that contains both ``columns`` and ``keep_columns``. It is not modified.
    columns : sequence of str, optional
        Columns to rescale. Defaults to the score and count columns listed below.
    keep_columns : sequence of str, default ('text_id', 'full_text', 'paragraphs')
        Columns copied to the result without scaling.

    Returns
    -------
    pandas.DataFrame
        ``keep_columns`` followed by the scaled ``columns``, carrying the same
        row index as ``feature_data``.

    Notes
    -----
    The scaler is fitted inside the function and not returned, so the same
    scaling cannot be re-applied to other data or inverted by the caller.
    """
    if columns is None:
        columns = ['cohesion', 'syntax', 'vocabulary', 'phraseology',
                   'grammar', 'conventions', 'excl_quest_sign_count',
                   'contractions_count', 'capitalized_mistakes',
                   'word_count', 'sentence_count', 'paragraph_count',
                   'avg_word_count_per_paragraph', 'avg_sentence_count_per_paragraph',
                   'has_short_paragraphs', 'difficult_word_count',
                   'spelling_mistake_count']
    # Lists are required for column selection; a tuple would be read as one key.
    columns = list(columns)
    keep_columns = list(keep_columns)

    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(feature_data[columns])

    # Re-attach the caller's index: the array has lost it, and a fresh 0..n-1
    # index would make the index-based merge drop or mismatch rows whenever
    # `feature_data` had been filtered, shuffled or split beforehand.
    scaled_frame = pd.DataFrame(scaled_values, columns=columns, index=feature_data.index)

    return pd.merge(feature_data[keep_columns], scaled_frame, left_index=True, right_index=True)