In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import pearsonr
from scipy.stats import f_oneway
from plotly.offline import iplot
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True)

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

In [3]:
df = pd.read_csv('batighor_books_data_cleaned.csv')

In [4]:
df.head()

Unnamed: 0,Title,Author,Total Ratings,Original Price,Discounted Price,Category,Number of Reviews
0,কসমোজাহি,মোহাম্মদ নাজিম উদ্দিন,9.0,380.0,298.0,থ্রিলার,5.0
1,কন্ট্রোল,মোহাম্মদ নাজিম উদ্দিন,11.0,650.0,510.0,রহস্য ও গোয়েন্দা,7.0
2,ম্যালিস,কেইগো হিগাশিনো,3.0,400.0,314.0,"রহস্য, গোয়েন্দা, ভৌতিক, মিথ, থ্রিলার, ও অ্যাডভ...",4.0
3,উজিরে আজম,হাসান ইনাম,5.0,220.0,172.0,থ্রিলার,1.0
4,মৃত কৈটভ,সৌরভ চক্রবর্তী,,350.0,274.0,অতিপ্রাকৃত ও ভৌতিক,


In [5]:
df['Number of Reviews'] = df['Number of Reviews'].replace('No', np.nan)

In [6]:
df['Number of Reviews'] = pd.to_numeric(df['Number of Reviews'], errors='coerce')

df['Discount Percentage'] = (df['Original Price'] - df['Discounted Price']) / df['Original Price'] * 100

# 1. Author Popularity
author_popularity = df.groupby('Author')['Total Ratings'].sum() / df.groupby('Author')['Title'].count()
df['Author Popularity'] = df['Author'].map(author_popularity)

# 2. Category Popularity Index
category_popularity_index = df.groupby('Category')['Total Ratings'].sum() / df.groupby('Category')['Title'].count()
df['Category Popularity Index'] = df['Category'].map(category_popularity_index)

# 3. Discount Effectiveness
df['Discount Effectiveness'] = df['Total Ratings'] / (df['Discount Percentage'] + 1)

# 4. Author Book Count
author_book_count = df['Author'].value_counts()
df['Author Book Count'] = df['Author'].map(author_book_count)

In [7]:
# Reorder the DataFrame columns
df = df[[
    'Title', 'Author', 'Category', 'Original Price', 'Discounted Price', 
    'Discount Percentage', 'Total Ratings', 'Number of Reviews', 
    'Author Popularity', 'Author Book Count', 'Category Popularity Index', 
    'Discount Effectiveness'
]]

In [8]:
df.head()

Unnamed: 0,Title,Author,Category,Original Price,Discounted Price,Discount Percentage,Total Ratings,Number of Reviews,Author Popularity,Author Book Count,Category Popularity Index,Discount Effectiveness
0,কসমোজাহি,মোহাম্মদ নাজিম উদ্দিন,থ্রিলার,380.0,298.0,21.578947,9.0,5.0,48.133333,45,28.109091,0.398601
1,কন্ট্রোল,মোহাম্মদ নাজিম উদ্দিন,রহস্য ও গোয়েন্দা,650.0,510.0,21.538462,11.0,7.0,48.133333,45,25.642857,0.488055
2,ম্যালিস,কেইগো হিগাশিনো,"রহস্য, গোয়েন্দা, ভৌতিক, মিথ, থ্রিলার, ও অ্যাডভ...",400.0,314.0,21.5,3.0,4.0,18.0,2,34.459459,0.133333
3,উজিরে আজম,হাসান ইনাম,থ্রিলার,220.0,172.0,21.818182,5.0,1.0,12.0,4,28.109091,0.219124
4,মৃত কৈটভ,সৌরভ চক্রবর্তী,অতিপ্রাকৃত ও ভৌতিক,350.0,274.0,21.714286,,,14.2,5,8.454545,


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 342 entries, 0 to 341
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Title                      342 non-null    object 
 1   Author                     342 non-null    object 
 2   Category                   342 non-null    object 
 3   Original Price             342 non-null    float64
 4   Discounted Price           342 non-null    float64
 5   Discount Percentage        342 non-null    float64
 6   Total Ratings              298 non-null    float64
 7   Number of Reviews          284 non-null    float64
 8   Author Popularity          342 non-null    float64
 9   Author Book Count          342 non-null    int64  
 10  Category Popularity Index  342 non-null    float64
 11  Discount Effectiveness     298 non-null    float64
dtypes: float64(8), int64(1), object(3)
memory usage: 32.2+ KB


In [10]:
df.isnull().sum()

Title                         0
Author                        0
Category                      0
Original Price                0
Discounted Price              0
Discount Percentage           0
Total Ratings                44
Number of Reviews            58
Author Popularity             0
Author Book Count             0
Category Popularity Index     0
Discount Effectiveness       44
dtype: int64

In [11]:
df['Total Ratings'] = df.groupby('Category')['Total Ratings'].transform(lambda x: x.fillna(x.mean()))

In [12]:
feature = ['Number of Reviews']

# Apply KNN imputation
imputer = KNNImputer(n_neighbors=5)
df[feature] = imputer.fit_transform(df[feature])

In [13]:
imputer = SimpleImputer(strategy='mean')
df['Discount Effectiveness'] = imputer.fit_transform(df[['Discount Effectiveness']])

In [14]:
df.isnull().sum()

Title                        0
Author                       0
Category                     0
Original Price               0
Discounted Price             0
Discount Percentage          0
Total Ratings                5
Number of Reviews            0
Author Popularity            0
Author Book Count            0
Category Popularity Index    0
Discount Effectiveness       0
dtype: int64

In [15]:
df.dropna(axis=0, inplace=True)

In [16]:
df.isnull().sum()

Title                        0
Author                       0
Category                     0
Original Price               0
Discounted Price             0
Discount Percentage          0
Total Ratings                0
Number of Reviews            0
Author Popularity            0
Author Book Count            0
Category Popularity Index    0
Discount Effectiveness       0
dtype: int64

In [17]:
df.shape

(337, 12)

In [18]:
# Which book categories have the highest and lowest average ratings?
category_avg_ratings = df.groupby('Category')['Total Ratings'].mean().sort_values(ascending=False)
highest_avg_ratings_category = category_avg_ratings.idxmax()
lowest_avg_ratings_category = category_avg_ratings.idxmin()

# Distribution of books across different categories
category_counts = df['Category'].value_counts()

# Analysis of discounts across categories
category_discount_stats = df.groupby('Category')['Discount Percentage'].describe()

In [19]:
highest_avg_ratings_category

'রহস্য,গোয়েন্দা,ভৌতিক ও থ্রিলার কালেকশন'

In [20]:
lowest_avg_ratings_category

'বাংলা কবিতা'

In [21]:
print(category_counts)

Category
থ্রিলার                                                                  110
রহস্য, গোয়েন্দা, ভৌতিক, মিথ, থ্রিলার, ও অ্যাডভেঞ্চার: অনুবাদ ও ইংরেজি     74
সমকালীন উপন্যাস                                                           29
অনুবাদ উপন্যাস                                                            16
রহস্য ও গোয়েন্দা                                                          14
থ্রিলার ও অ্যাডভেঞ্চার উপন্যাস                                            14
সায়েন্স ফিকশন                                                             13
অতিপ্রাকৃত ও ভৌতিক                                                        11
বাংলা কবিতা                                                                9
শিশু-কিশোর: রহস্য, গোয়েন্দা, ভৌতিক, থ্রিলার ও অ্যাডভেঞ্চার                 6
রহস্য,গোয়েন্দা,ভৌতিক ও থ্রিলার কালেকশন                                     5
প্যারাসাইকোলজিকাল উপন্যাস                                                  4
অনুবাদ সায়েন্স ফিকশন                                               

In [22]:
category_discount_stats

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
অতিপ্রাকৃত ও ভৌতিক,11.0,22.345376,2.546483,21.071429,21.55,21.666667,21.690476,30.0
অতিপ্রাকৃত ও ভৌতিক উপন্যাস,3.0,22.722222,1.974373,21.5,21.583333,21.666667,23.333333,25.0
অনুবাদ উপন্যাস,16.0,25.807465,4.330714,21.470588,21.644737,25.909091,30.0,30.0
অনুবাদ গল্প,2.0,21.742424,0.107137,21.666667,21.704545,21.742424,21.780303,21.818182
অনুবাদ সায়েন্স ফিকশন,4.0,21.599009,0.052631,21.538462,21.577797,21.595455,21.616667,21.666667
জীবনী ও স্মৃতিচারণ: বিবিধ,2.0,21.633333,0.04714,21.6,21.616667,21.633333,21.65,21.666667
থ্রিলার,110.0,23.634658,3.584615,21.333333,21.538462,21.6,23.983957,30.0
থ্রিলার ও অ্যাডভেঞ্চার উপন্যাস,14.0,28.181319,3.614573,21.333333,30.0,30.0,30.0,30.0
থ্রিলার ও অ্যাডভেঞ্চার গল্প,1.0,26.0,,26.0,26.0,26.0,26.0,26.0
পেশাগত স্মৃতিচারণ ও অভিজ্ঞতা,1.0,25.0,,25.0,25.0,25.0,25.0,25.0


In [23]:
# Top authors based on total ratings and number of reviews
top_authors_by_ratings = df.groupby('Author')['Total Ratings'].sum().sort_values(ascending=False).head()
top_authors_by_reviews = df.groupby('Author')['Number of Reviews'].sum().sort_values(ascending=False).head()

# Correlation between author popularity and ratings/reviews
author_popularity_ratings_corr = df[['Author Popularity', 'Total Ratings']].corr()
author_popularity_reviews_corr = df[['Author Popularity', 'Number of Reviews']].corr()

# Authors with highest discount effectiveness
top_authors_by_discount_effectiveness = df.groupby('Author')['Discount Effectiveness'].mean().sort_values(ascending=False).head()

In [24]:
print(top_authors_by_ratings)
print(top_authors_by_reviews)
print(author_popularity_ratings_corr)
print(author_popularity_reviews_corr)

Author
মোহাম্মদ নাজিম উদ্দিন    2166.0
নিক পিরোগ                1014.0
ড্যান ব্রাউন              780.0
কিশোর পাশা ইমন            302.4
মাশুদুল হক                225.0
Name: Total Ratings, dtype: float64
Author
মোহাম্মদ নাজিম উদ্দিন    1826.647887
নিক পিরোগ                1237.000000
মারিও পূজো                498.000000
ড্যান ব্রাউন              392.000000
মাশুদুল হক                202.000000
Name: Number of Reviews, dtype: float64
                   Author Popularity  Total Ratings
Author Popularity           1.000000       0.651673
Total Ratings               0.651673       1.000000
                   Author Popularity  Number of Reviews
Author Popularity           1.000000           0.499054
Number of Reviews           0.499054           1.000000


In [25]:
# How discount percentage affects reviews or ratings
discount_reviews_corr = df[['Discount Percentage', 'Number of Reviews']].corr()
discount_ratings_corr = df[['Discount Percentage', 'Total Ratings']].corr()

# Trends in pricing strategies
category_price_stats = df.groupby('Category')[['Original Price', 'Discounted Price']].mean()

# Overall discount effectiveness across categories
overall_discount_effectiveness = df['Discount Effectiveness'].mean()

In [26]:
print(discount_reviews_corr)
print(discount_reviews_corr)
print(category_price_stats)
print(overall_discount_effectiveness)

                     Discount Percentage  Number of Reviews
Discount Percentage              1.00000           -0.05553
Number of Reviews               -0.05553            1.00000
                     Discount Percentage  Number of Reviews
Discount Percentage              1.00000           -0.05553
Number of Reviews               -0.05553            1.00000
                                                    Original Price  \
Category                                                             
অতিপ্রাকৃত ও ভৌতিক                                      288.181818   
অতিপ্রাকৃত ও ভৌতিক উপন্যাস                              393.333333   
অনুবাদ উপন্যাস                                          298.750000   
অনুবাদ গল্প                                             200.000000   
অনুবাদ সায়েন্স ফিকশন                                    505.000000   
জীবনী ও স্মৃতিচারণ: বিবিধ                               275.000000   
থ্রিলার                                                 341.090909   
থ্রিলার ও 

In [27]:
# Calculate the correlation coefficient
corr_coef, _ = pearsonr(df['Author Popularity'], df['Number of Reviews'])
print(f'Correlation Coefficient: {corr_coef:.2f}')

# Create scatter plot with trendline
fig = px.scatter(df, x='Author Popularity', y='Number of Reviews', trendline='ols',
                 title='Relationship between Author Popularity and Number of Reviews',
                 labels={'Author Popularity': 'Author Popularity', 'Number of Reviews': 'Number of Reviews'})

# Adjust the figure size
fig.update_layout(
    width=1000,  # Set the width of the figure
    height=600   # Set the height of the figure
)

iplot(fig)

Correlation Coefficient: 0.50


In [28]:
# Calculate the correlation coefficient
corr_coef, _ = pearsonr(df['Discount Percentage'], df['Total Ratings'])
print(f'Correlation Coefficient: {corr_coef:.2f}')

# Create scatter plot with trendline
fig = px.scatter(df, x='Discount Percentage', y='Total Ratings', trendline='ols',
                 title='Correlation between Discount Percentage and Total Ratings',
                 labels={'Discount Percentage': 'Discount Percentage (%)', 'Total Ratings': 'Total Ratings'})

# Adjust the figure size
fig.update_layout(
    width=1000,  # Set the width of the figure
    height=600   # Set the height of the figure
)

iplot(fig)

Correlation Coefficient: -0.20


In [29]:
# Calculate review-to-price ratio
df['Review to Price Ratio'] = df['Number of Reviews'] / df['Original Price']

# Verify the actual column name for rating and use it
rating_column = 'Total Ratings'  # Replace with the correct column name from your dataset

# Calculate correlation coefficient between review-to-price ratio and rating
corr_coef, _ = pearsonr(df['Review to Price Ratio'], df[rating_column])
print(f'Correlation Coefficient: {corr_coef:.2f}')

# Create scatter plot
fig = px.scatter(df, x='Review to Price Ratio', y=rating_column, trendline='ols',
                 title='Correlation between Review-to-Price Ratio and Rating',
                 labels={'Review to Price Ratio': 'Review-to-Price Ratio', rating_column: 'Rating'})

iplot(fig)

Correlation Coefficient: 0.69


In [30]:
# Group by 'Category' and calculate average discount percentage
avg_discount_by_category = df.groupby('Category')['Discount Percentage'].mean().reset_index()

# Sort to find categories with highest average discount percentages
highest_avg_discount_categories = avg_discount_by_category.sort_values(by='Discount Percentage', ascending=False)

# Plotting using Plotly
fig = px.bar(highest_avg_discount_categories, x='Category', y='Discount Percentage',
             title='Average Discount Percentage by Category',
             labels={'Category': 'Category', 'Discount Percentage': 'Average Discount Percentage'})

# Adjust the figure size
fig.update_layout(
    width=800,  # Set the width of the figure
    height=800   # Set the height of the figure
)
iplot(fig)

In [31]:
# Group by 'Author' and calculate average total ratings and count of books
avg_ratings_by_author = df.groupby('Author').agg({
    'Total Ratings': 'mean',  # Replace 'Total Ratings' with the actual column name for average total ratings
    'Title': 'count'          # Assuming 'Title' column represents the number of books by each author
}).reset_index()

avg_ratings_by_author = avg_ratings_by_author.rename(columns={'Title': 'Book Count'})

# Plotting the relationship
fig = px.scatter(avg_ratings_by_author, x='Book Count', y='Total Ratings',
                 title='Average Total Ratings vs. Author\'s Book Count',
                 labels={'Book Count': 'Author\'s Book Count', 'Total Ratings': 'Average Total Ratings'})

iplot(fig)

In [32]:
# Create scatter plot using Plotly Express
fig = px.scatter(df, x='Original Price', y='Number of Reviews', 
                 title='Relationship between Original Price and Number of Reviews',
                 labels={'Original Price': 'Original Price', 'Number of Reviews': 'Number of Reviews'})

# Calculate correlation coefficient
corr_coef = np.corrcoef(df['Original Price'], df['Number of Reviews'])[0, 1]
print(f'Correlation Coefficient: {corr_coef:.2f}')

# Show the plot
iplot(fig)

Correlation Coefficient: 0.57


In [33]:
# Create box plot with Plotly Express
fig = px.box(df, x='Author Popularity', y='Review to Price Ratio',
             points="all",  # Show all data points
             title='Review-to-Price Ratio Comparison between Popular and Less Popular Authors',
             labels={'Author Popularity': 'Author Popularity', 'Review to Price Ratio': 'Review to Price Ratio'})

fig.update_layout(xaxis_title='Author Popularity', yaxis_title='Review to Price Ratio')

# Show the plot
iplot(fig)

In [34]:
# Calculate average rating by category
avg_rating_by_category = df.groupby('Category')['Total Ratings'].mean().reset_index()

# Sort by average rating (optional)
avg_rating_by_category = avg_rating_by_category.sort_values(by='Total Ratings', ascending=False)

# Create bar chart with Plotly Express
fig = px.bar(avg_rating_by_category, x='Category', y='Total Ratings',
             title='Average Rating by Category',
             labels={'Category': 'Category', 'Total Ratings': 'Average Rating'})

fig.update_layout(xaxis_title='Category', yaxis_title='Average Rating')

# Show the plot
iplot(fig)

In [35]:
# Question 1: Top-selling categories
category_counts = df['Category'].value_counts()
fig = px.bar(category_counts, x=category_counts.index, y=category_counts.values,
              labels={'x': 'Category', 'y': 'Count'},
              title='Distribution of Books by Category')
iplot(fig)

In [36]:
# Question 2: Correlation between ratings, reviews, and review-to-price ratio
fig = px.scatter(df, x='Total Ratings', y='Number of Reviews', size='Review to Price Ratio',trendline='ols',
                  labels={'Total Ratings': 'Total Ratings', 'Number of Reviews': 'Number of Reviews', 'Review to Price Ratio': 'Review to Price Ratio'},
                  title='Correlation between Ratings, Reviews, and Review-to-Price Ratio')
iplot(fig)

In [37]:
# Question 3: Optimal price ranges for different categories
fig = px.histogram(df, x='Original Price', color='Category',
                    labels={'Original Price': 'Original Price', 'Category': 'Category'},
                    title='Distribution of Original Prices by Category')
iplot(fig)





In [38]:
# Question 4: Impact of book title word count (assuming it's available) on engagement metrics
df['Title Word Count'] = df['Title'].str.split().str.len()
fig = px.scatter(df, x='Title Word Count', y='Number of Reviews',
                  labels={'Title Word Count': 'Title Word Count', 'Number of Reviews': 'Number of Reviews'},
                  title='Impact of Title Word Count on Number of Reviews')
iplot(fig)

In [39]:
# Question 5: Books with the best review-to-price ratio
fig = px.scatter(df, x='Review to Price Ratio', y='Number of Reviews', trendline='ols',
                  labels={'Review to Price Ratio': 'Review to Price Ratio', 'Number of Reviews': 'Number of Reviews'},
                  title='Review-to-Price Ratio vs Number of Reviews')
iplot(fig)

In [40]:
# Question 6: Influence of customer demographics (assuming 'Gender' as demographic) on book preferences
fig = px.histogram(df, x='Category', color='Author',
                    labels={'Category': 'Category', 'Gender': 'Gender'},
                    title='Distribution of Book Preferences by Author Book Count')
iplot(fig)





In [41]:
# Question 7: Detecting outliers in data 
fig = go.Figure()
fig.add_trace(go.Box(y=df['Original Price'], name='Original Price'))
fig.update_layout(title='Distribution of Original Prices (Box Plot)')
iplot(fig)

In [42]:
fig = px.scatter(df, x='Discount Effectiveness', y='Number of Reviews', trendline='ols',
                 labels={'Discount Effectiveness': 'Discount Effectiveness', 'Number of Reviews': 'Number of Reviews'},
                 title='Discount Effectiveness vs Number of Reviews')
iplot(fig)

In [43]:
fig = px.bar(df, x='Category', y='Category Popularity Index',
             labels={'Category': 'Category', 'Category Popularity Index': 'Popularity Index'},
             title='Category Popularity Index')
iplot(fig)

In [44]:
corr_data = df.select_dtypes(include='number').corr()
# Create heatmap trace
heatmap_trace = go.Heatmap(
                        x=corr_data.columns,
                        y=corr_data.index,
                        z=corr_data.values,
                        colorscale='Viridis'
                        )

# Create layout
layout = go.Layout(title='Correlation Plot')

# Create figure
fig = go.Figure(data=[heatmap_trace], layout=layout)

# Add annotations with correlation coefficients
for i in range(len(corr_data)):
    for j in range(len(corr_data)):
        fig.add_annotation(x=corr_data.columns[i], y=corr_data.index[j],
                        text=str(round(corr_data.iloc[j, i], 2)),
                        showarrow=False)
# Adjust the figure size
fig.update_layout(
    width=800,  # Set the width of the figure
    height=600   # Set the height of the figure
)
iplot(fig)