In [3]:
import pandas as pd
import string
import numpy as np
import textstat as txst
from sklearn.model_selection import train_test_split
from sklearn import metrics
import nltk
from nltk.corpus import stopwords
import statsmodels.api as sm

# Load the training set; the CSV is decoded as latin-1.
train_df = pd.read_csv('data/train.csv', encoding='latin-1')

# Readability scores are computed here, before the cleaning below overwrites
# the 'excerpt' column, so they are based on the text exactly as loaded.
train_df['rd_flesch_ease'] = train_df.excerpt.apply(txst.flesch_reading_ease)
train_df['rd_dalechall'] = train_df.excerpt.apply(txst.dale_chall_readability_score)
train_df['rd_colemanliau'] = train_df.excerpt.apply(txst.coleman_liau_index)

# Cleaning
# NOTE: each step below overwrites train_df['excerpt'] in place, so the raw
# text is no longer available from this frame once the cell has run.

# Lower-case
train_df['excerpt'] = train_df['excerpt'].str.lower()

# Remove punctuation. The vectorized .str accessor is used instead of looking
# rows up by range(len(...)), which only works when the index is exactly
# 0..n-1 (label lookup, not positional).
table = str.maketrans('', '', string.punctuation)
train_df['excerpt'] = train_df['excerpt'].str.translate(table)

# Remove stopwords
nltk.download('stopwords')

stop = stopwords.words('english')
# Set membership is constant-time; `stop` itself is kept as the original list
# in case other cells use it.
stop_lookup = set(stop)
train_df['excerpt'] = train_df['excerpt'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)

[nltk_data] Downloading package stopwords to /home/edu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(norm="l2")
# Fit on the cleaned excerpts and densify. `processed_text` stays a dense
# array (one row per excerpt) so any later cell that uses it keeps working.
processed_text = vectorizer.fit_transform(train_df['excerpt']).toarray()

# Collapse each excerpt's TF-IDF row to a single scalar: the sum of its weights.
# A vectorized row sum replaces the per-row Python `sum`, which iterated over
# every matrix element in the interpreter.
# NOTE(review): a single sum discards which terms carried the weight — confirm
# this scalar is the intended feature.
train_df['processed_text'] = processed_text.sum(axis=1)

In [37]:
# Columns of train_df used as predictors.
feature_list = [
    'processed_text',
    'standard_error',
    'rd_flesch_ease',
    'rd_dalechall',
    'rd_colemanliau',
]

# Predictor matrix, the same matrix with a constant column added by
# sm.add_constant, and the response vector.
data = train_df[feature_list].to_numpy(copy=True)
exog = sm.add_constant(data)
endog = train_df['target'].to_numpy(copy=True)

# NOTE(review): despite the name `glmGamma`, the model is fitted with the
# Gaussian family; the name is kept so other cells referencing it still work.
gaussian_family = sm.families.Gaussian()
glmGamma = sm.GLM(endog, exog, family=gaussian_family).fit()

print(feature_list)
print(glmGamma.summary())

# Predictions on the same rows the model was fitted on (in-sample), printed
# next to the observed targets.
y_pred = glmGamma.predict(exog)
print(y_pred)
print(endog)

['processed_text', 'standard_error', 'rd_flesch_ease', 'rd_dalechall', 'rd_colemanliau']
                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                 2834
Model:                            GLM   Df Residuals:                     2828
Model Family:                Gaussian   Df Model:                            5
Link Function:               identity   Scale:                         0.68081
Method:                          IRLS   Log-Likelihood:                -3473.5
Date:                Mon, 14 Jun 2021   Deviance:                       1925.3
Time:                        21:38:14   Pearson chi2:                 1.93e+03
No. Iterations:                     3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------