## Predicting Book Ratings on GoodReads using Linear Regression Models

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

import scipy.stats as stats

### Dataframe

In [2]:
# Read the cleaned GoodReads CSV and discard its 'Unnamed: 0' column
# (presumably an index saved by an earlier export — confirm).
gr_data = pd.read_csv('./2. Clean_df/gr_data_CLEAN.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Preview the first rows together with the (rows, columns) shape of the frame.
display(gr_data.head(), gr_data.shape)

Unnamed: 0,title,series,author,description,language,pages,isbn,book_format,publish_year,first_publish_year,...,West Australian Young Readers' Book Award (WAYRBA) for Older Readers,West Australian Young Readers' Book Award (WAYRBA) for Younger Readers,William Allen White Children's Book Award,William C. Morris YA Debut Award Nominee,Women's Prize for Fiction Nominee,Women's Prize for Fiction Nominee for Longlist,World Fantasy Award Nominee for Best Novel,World Fantasy Award for Best Novel,Zilveren Griffel,الجائزة العالمية للرواية العربية (أي باف) / International Prize for Arabic Fiction (IPAF) Nominee
0,The Hunger Games,The Hunger Games,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,200-300,9780440000000.0,Hardcover,2008,2008,...,1,0,0,0,0,0,0,0,0,0
1,Harry Potter and the Order of the Phoenix,Harry Potter,J.K. Rowling,There is a door at the end of a silent corrido...,English,700-800,9780440000000.0,Paperback,2004,2003,...,0,0,0,0,0,0,0,0,0,0
2,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,English,200-300,10000000000000.0,Paperback,2006,1960,...,0,0,0,0,0,0,0,0,0,0
3,Pride and Prejudice,Single Book,Jane Austen,Alternate cover edition of ISBN 9780679783268S...,English,100-200,10000000000000.0,Paperback,2000,2013,...,0,0,0,0,0,0,0,0,0,0
4,Twilight,The Twilight Saga,Stephenie Meyer,About three things I was absolutely positive.\...,English,400-500,9780320000000.0,Paperback,2006,2005,...,1,0,0,0,0,0,0,0,0,0


(48655, 862)

#### Drop unnecessary columns

In [4]:
# Columns considered unnecessary for the rest of the notebook
unused_columns = ['title', 'isbn', 'description']
gr_data = gr_data.drop(columns=unused_columns)

In [5]:
# Sanity check: the column count should be 3 lower than before the drop.
gr_data.shape

(48655, 859)

#### NaN

In [6]:
# Total number of missing values across the whole frame
# (per-column NaN counts, then summed over all columns).
gr_data.isna().sum().sum()

0

#### Types

In [7]:
# Data types of the first 18 columns only; the remaining columns are not listed here.
gr_data.dtypes.iloc[:18]

series                  object
author                  object
language                object
pages                   object
book_format             object
publish_year             int64
first_publish_year       int64
publisher               object
liked_perc             float64
bbe_score              float64
bbe_votes              float64
log_num_ratings        float64
5_stars_num_ratings      int64
4_stars_num_ratings      int64
3_stars_num_ratings      int64
2_stars_num_ratings      int64
1_star_num_ratings       int64
rating                 float64
dtype: object

#### Correlation-Association amongst features

In [8]:
# Partition the columns by dtype: object (string) columns go to `categorical`,
# every numeric column goes to `numerical`.
categorical = gr_data.select_dtypes(include='object')
numerical = gr_data.select_dtypes(include='number')

In [9]:
# (rows, columns) of the object-dtype subset
categorical.shape

(48655, 6)

In [10]:
# (rows, columns) of the numeric subset
numerical.shape

(48655, 853)

#### Numerical

Correlation matrix (only correlations > 0.6 or < -0.6 are shown):

In [11]:
# Correlation matrix for the numerical features (853 columns).
# Only pairs whose absolute correlation exceeds this threshold are reported.
CORR_THRESHOLD = 0.6
corr_matrix = numerical.corr()

# Work on the underlying arrays: scalar `.loc` lookups for every cell of the
# 853 x 853 matrix are very slow, while the same filter is instant when vectorised.
corr_values = corr_matrix.to_numpy()
row_labels = corr_matrix.index.to_numpy()
col_labels = corr_matrix.columns.to_numpy()

# (row, col) labels of every non-NaN cell, in row-major order.
# Retained so that anything reading `indices` keeps working.
notna_rows, notna_cols = np.nonzero(corr_matrix.notna().to_numpy())
indices = list(zip(row_labels[notna_rows], col_labels[notna_cols]))

# Strong correlations (positive or negative) between two *different* columns.
# NaN cells compare as False, so they are excluded without a separate check.
is_strong = corr_matrix.abs().gt(CORR_THRESHOLD).to_numpy()
is_off_diagonal = row_labels[:, None] != col_labels[None, :]
strong_rows, strong_cols = np.nonzero(is_strong & is_off_diagonal)

# One record per qualifying cell: row name, column name, correlation value
df = pd.DataFrame(
    list(zip(row_labels[strong_rows],
             col_labels[strong_cols],
             corr_values[strong_rows, strong_cols])),
    columns=['row name', 'col name', 'corr number'],
)

# Each pair shows up twice (A-B and B-A); keep only the copy whose
# row name sorts before its col name.
df = df[df['row name'] < df['col name']]

# Sort by correlation value, largest first
df = df.sort_values(by='corr number', ascending=False)

# Show the table followed by the number of strongly correlated pairs
display(df, len(df))


Unnamed: 0,row name,col name,corr number
11,3_stars_num_ratings,4_stars_num_ratings,0.957836
16,2_stars_num_ratings,3_stars_num_ratings,0.956072
0,bbe_score,bbe_votes,0.939596
21,1_star_num_ratings,2_stars_num_ratings,0.933824
6,4_stars_num_ratings,5_stars_num_ratings,0.931859
15,2_stars_num_ratings,4_stars_num_ratings,0.856281
10,3_stars_num_ratings,5_stars_num_ratings,0.83187
20,1_star_num_ratings,3_stars_num_ratings,0.820076
14,2_stars_num_ratings,5_stars_num_ratings,0.726403
19,1_star_num_ratings,4_stars_num_ratings,0.711987


33

In [12]:
# Compare how often the two Hugo-related indicator columns are set.
# The trailing space in the first label is part of the actual column name.
hugo_columns = ('Hugo Award for Best Novel ', 'hugo awards')
display(*(numerical[name].value_counts() for name in hugo_columns))

0    48593
1       62
Name: Hugo Award for Best Novel , dtype: int64

0    48620
1       35
Name: hugo awards, dtype: int64

In [13]:
# Compare how often the two 'love inspired' indicator columns are set.
love_inspired_columns = ('love inspired', 'love inspired historical')
display(*(numerical[name].value_counts() for name in love_inspired_columns))

0    48645
1       10
Name: love inspired, dtype: int64

0    48650
1        5
Name: love inspired historical, dtype: int64

In [14]:
# Columns to remove because they are strongly correlated with another column
# that is kept (one column from each highly correlated pair).
# NOTE(review): 1_star/5_stars rating counts and liked_perc stay in the feature
# set; if `rating` is the prediction target they may leak it — confirm.
high_correlated = [
    '2_stars_num_ratings',
    '3_stars_num_ratings',
    '4_stars_num_ratings',
    'bbe_votes',
    'hugo awards',
    'booze',
    'love inspired historical',
]

In [15]:
# Width of the numeric subset before the highly correlated columns are removed
numerical.shape

(48655, 853)

In [16]:
# Remove the redundant features listed in `high_correlated`,
# then show the resulting (rows, columns) shape.
numerical = numerical.drop(columns=high_correlated)
numerical.shape

(48655, 846)

In [None]:
# Disabled re-check: recompute the high-correlation table after the highly
# correlated columns were dropped (846 numerical columns remain).
# Uncomment every code line below to run it.
#corr_matrix = numerical.corr()

# Get the indices of non-NaN values in the correlation matrix
#indices = [(row, col) for row in corr_matrix.index for col in corr_matrix.columns if not pd.isna(corr_matrix.loc[row, col])]

# Create a DataFrame with the row name, column name, and correlation number for each
# non-NaN value in the correlation matrix that meets the condition and where row name
# and col name are different
#df = pd.DataFrame([(idx[0], idx[1], corr_matrix.loc[idx]) for idx in indices if (corr_matrix.loc[idx] > 0.6 or corr_matrix.loc[idx] < -0.6) and idx[0] != idx[1]], columns=['row name', 'col name', 'corr number'])

# Drop duplicate rows where row name is greater than col name
#df = df[df['row name'] < df['col name']]

# Sort the DataFrame by the correlation number in descending order
#df = df.sort_values(by='corr number', ascending=False)

# Print the sorted DataFrame
#display(df, len(df))


Note that the Pearson correlation matrix computed above does not depend on whether the data is scaled: correlation measures the strength of the linear relationship between two variables and is already normalised by their standard deviations, so linearly rescaling a feature (e.g. standardising it) leaves its correlations unchanged.

What does depend on scale is the covariance matrix, and therefore any method built on it. For example, if one variable has a much larger scale than another, it may dominate a covariance-based analysis and obscure relationships between the other variables. Scaling the variables to the same range of values mitigates this problem and gives every feature a comparable influence.

It is generally recommended to scale the numerical features before applying PCA, so that they are on a similar scale and carry equal weight in the analysis. PCA is sensitive to the scale of the variables, and features with larger scales can dominate the components.

#### Categorical

In [17]:
# (rows, columns) of the object-dtype subset
categorical.shape

(48655, 6)

In [18]:
# Names of the object-dtype columns
categorical.columns

Index(['series', 'author', 'language', 'pages', 'book_format', 'publisher'], dtype='object')

In [19]:
# Report the cardinality (number of distinct values) of every categorical column
for name, n_unique in categorical.nunique().items():
    print(f"Unique values for {name}: {n_unique}")

Unique values for series: 4405
Unique values for author: 21720
Unique values for language: 92
Unique values for pages: 12
Unique values for book_format: 10
Unique values for publisher: 4


In [20]:
# Inspect the raw values of the 'author' column
categorical['author']

0        Suzanne Collins
1           J.K. Rowling
2             Harper Lee
3            Jane Austen
4        Stephenie Meyer
              ...       
48650      Sherry Gammon
48651      Emma Michaels
48652     Kim Richardson
48653        Tom Pollack
48654       Misty Moncur
Name: author, Length: 48655, dtype: object

In [23]:
# Inspect the raw values of the 'series' column
categorical['series']

0             The Hunger Games
1                 Harry Potter
2        To Kill a Mockingbird
3                  Single Book
4            The Twilight Saga
                 ...          
48650                Port Fare
48651           Sense of Truth
48652           Soul Guardians
48653              Single Book
48654              Single Book
Name: series, Length: 48655, dtype: object