In [1]:
import pandas as pd
import numpy as np

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Load the raw book listing from a CSV in the current working directory.
# NOTE(review): the df.columns output further down this notebook shows the file
# carries a leftover 'Unnamed: 0' column. It is not dropped in this cell, so it
# stays in df and is written back out by the to_csv call at the end of the cell.
df = pd.read_csv('rokomari.csv')

# Function to convert price columns to numeric
def convert_to_numeric(x):
    """Convert a single raw price cell to a number.

    Parameters
    ----------
    x : str, int, float, or anything else
        A price as read from the CSV. Strings may contain thousands
        separators (e.g. ``'1,200'``).

    Returns
    -------
    number
        The parsed value. Values that are already numeric are returned
        unchanged, so applying this function twice to the same column is
        harmless. Unparseable strings, ``None``, booleans and any other
        type yield ``np.nan``.
    """
    # Already numeric: pass through. Previously these hit `.replace` on a
    # non-string, raised AttributeError, and were silently turned into NaN.
    if isinstance(x, (int, float, np.integer, np.floating)) and not isinstance(x, bool):
        return x
    if isinstance(x, str):
        # Strip thousands separators; errors='coerce' maps junk text to NaN.
        return pd.to_numeric(x.replace(',', ''), errors='coerce')
    # None, booleans and any other unexpected type count as missing.
    return np.nan

# Convert price columns to numeric (e.g. '1,200' -> 1200; unparseable -> NaN)
for price_col in ('Original Price', 'Discounted Price'):
    df[price_col] = df[price_col].apply(convert_to_numeric)

# Coerce Rating to a numeric dtype; entries that cannot be parsed become NaN
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Drop rows with missing values (in any column, including the NaNs produced above)
df = df.dropna()

# Feature Engineering
df['Discount_Amount'] = df['Original Price'] - df['Discounted Price']
# Guard the division: an Original Price of 0 would otherwise produce +/-inf,
# which the downstream imputer/model cannot accept. Mapping the zero
# denominator to NaN yields a NaN percentage that the numeric pipeline's
# median imputer can fill.
nonzero_original_price = df['Original Price'].replace(0, np.nan)
df['Discount_Percentage'] = (df['Discount_Amount'] / nonzero_original_price) * 100
# Vectorised string length; unlike apply(len) it yields NaN instead of raising
# when a title is not a string.
df['Title_Length'] = df['Title'].str.len()

# Prepare features and target
numeric_features = ['Original Price', 'Discounted Price', 'Discount_Amount', 'Discount_Percentage', 'Title_Length']
categorical_features = ['Author']

X = df[numeric_features + categorical_features]
y = df['Rating']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Numeric columns: fill gaps with the median, then scale with RobustScaler
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' lets categories unseen during fit pass through transform.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Random forest regressor: 100 trees, seeded for reproducibility
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Full pipeline: preprocessing followed by the regressor
pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', rf)])

# 5-fold cross-validation over the full X / y. The scorer returns negated MSE,
# so the sign is flipped when printing.
cv_scores = cross_val_score(
    pipeline,
    X,
    y,
    cv=5,
    scoring='neg_mean_squared_error',
)
print("Cross-validation MSE scores:", -cv_scores)
print("Average MSE:", -cv_scores.mean())

# Train the pipeline on the training split
pipeline.fit(X_train, y_train)

# Score the held-out split
y_pred = pipeline.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("\nModel Performance:")
print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")

# Score every row of X with the fitted pipeline, then dense-rank the
# predictions so that rank 1 is the highest predicted rating
all_predictions = pipeline.predict(X)
df['Predicted_Rating'] = all_predictions
df['Rank'] = df['Predicted_Rating'].rank(method='dense', ascending=False)

# Show the ten best-ranked rows with the columns of interest
display_columns = [
    'Title',
    'Author',
    'Original Price',
    'Discounted Price',
    'Rating',
    'Predicted_Rating',
    'Rank',
]
ranked_books = df.sort_values('Rank')
print("\nTop 10 Ranked Books:")
print(ranked_books[display_columns].head(10))

# Write the complete table (without the index) to disk
output_path = 'ranked_books_updated_v3.csv'
df.to_csv(output_path, index=False)
print("\nFull results saved to 'ranked_books_updated_v3.csv'")

Cross-validation MSE scores: [2577.107101     54.28623332   20.25841867  450.32404798  314.14378414]
Average MSE: 683.2239170211864

Model Performance:
Mean Squared Error: 236.6665200034721
R-squared Score: 0.4086879168843066

Top 10 Ranked Books:
                        Title         Author  Original Price  \
0                       দেয়াল  হুমায়ূন আহমেদ             450   
10         জোছনা ও জননীর গল্প  হুমায়ূন আহমেদ             800   
1016  তোমার নামে সন্ধ্যা নামে   সাদাত হোসাইন             600   
21              বাদশাহ নামদার  হুমায়ূন আহমেদ             450   
203            মধ্যাহ্ন অখণ্ড  হুমায়ূন আহমেদ             700   
35                      ফানুস     মৌরি মরিয়ম             600   
13                    হিমু দশ  হুমায়ূন আহমেদ             850   
1142        To The Woods Dark  হুমায়ূন আহমেদ             120   
28                মাতাল হাওয়া  হুমায়ূন আহমেদ             600   
37                   মরণোত্তম   সাদাত হোসাইন             300   

      Discounted Price  Rating  Predicted_Ratin

In [2]:
# (Re)load the book listing from disk; this rebinds df to a fresh, unprocessed frame.
df = pd.read_csv('rokomari.csv')

In [3]:
# Inspect the column labels of the freshly loaded frame.
df.columns

Index(['Unnamed: 0', 'Title', 'Author', 'Original Price', 'Discounted Price',
       'Rating'],
      dtype='object')

In [4]:
# Remove the stray 'Unnamed: 0' column (presumably an index saved by an earlier
# export -- confirm against the data source).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Preview the first rows of df.
df.head()

Unnamed: 0,Title,Author,Original Price,Discounted Price,Rating
0,দেয়াল,হুমায়ূন আহমেদ,450,315,431.0
1,শম্পা কী চায়?,এশরার লতিফ,350,301,11.0
2,ইনসমনিয়া,ফরহাদ হোসেন,320,275,3.0
3,সাতপুরুষে খেদু মিয়া,অসীম হিমেল,580,499,5.0
4,হুমায়ূন আহমেদ রচনাবলী ২,হুমায়ূন আহমেদ,800,688,11.0


In [6]:
# Group the rows by author; reused by the next cell for per-author counts.
grouped = df.groupby('Author')

In [7]:
# Count how often each title occurs within each author group.
# The result is a Series indexed by (Author, Title).
count = grouped['Title'].value_counts()

In [8]:
# Display the per-author title counts.
count

Author               Title                                   
Dr. Naushaba Singh   Celebrating Relationships                   1
অধ্যাপক আতাউর রহমান  দুই দু’গুণে পাঁচ                            1
                     মধ্যরাতের জোকস্‌                            1
                     রসগল্পের রাজকোষ                             1
অধ্যাপক আবু সাইয়িদ   বাংলাদেশের স্বাধীনতা যুদ্ধের আড়ালে যুদ্ধ    1
                                                                ..
হুমায়ূন আহমেদ        ৯০ দশকের ৫টি উপন্যাস                        1
হুসেইন ফজলুল বারী    রানি দ্বিতীয় এলিজাবেথ ও মায়াবী দাঁড়কাক      1
হো চি মিন            কারাগারের কাব্য                             1
হোসেন আবদুল মান্নান  কেবলই পিছনে তাকাই                           1
                     বেদনার জলছবি                                1
Name: count, Length: 1354, dtype: int64

In [9]:
# Keep only the rows whose Author equals this exact string (boolean-mask filter).
humayun = df[df['Author'] == 'হুমায়ূন আহমেদ']

In [12]:
# Number of distinct values in the Author column.
df['Author'].nunique()

476

In [14]:

# Function to convert price columns to numeric
def convert_to_numeric(x):
    """Convert a single raw price cell to a number.

    Parameters
    ----------
    x : str, int, float, or anything else
        A price as read from the CSV. Strings may contain thousands
        separators (e.g. ``'1,200'``).

    Returns
    -------
    number
        The parsed value. Values that are already numeric are returned
        unchanged, so re-running the conversion cell on an already converted
        column does not wipe it. Unparseable strings, ``None``, booleans and
        any other type yield ``np.nan``.
    """
    # Already numeric: pass through. Previously these hit `.replace` on a
    # non-string, raised AttributeError, and were silently turned into NaN.
    if isinstance(x, (int, float, np.integer, np.floating)) and not isinstance(x, bool):
        return x
    if isinstance(x, str):
        # Strip thousands separators; errors='coerce' maps junk text to NaN.
        return pd.to_numeric(x.replace(',', ''), errors='coerce')
    # None, booleans and any other unexpected type count as missing.
    return np.nan

# Parse both price columns into numbers, one cell at a time
for price_column in ['Original Price', 'Discounted Price']:
    df[price_column] = df[price_column].apply(convert_to_numeric)

# Coerce Rating to a numeric dtype; values that fail to parse turn into NaN
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

In [15]:
from sklearn.impute import KNNImputer

# KNNImputer finds neighbours using the columns it is given. Fitted on 'Rating'
# alone, a row with a missing rating has no observed value to measure distance
# with, so the imputer falls back to the plain column mean. Passing the price
# columns as context lets a missing rating be filled from the 5 rows with the
# most similar prices.
# NOTE(review): 'Rating' is used as the regression target in the first modelling
# cell of this notebook; imputing a target before training is questionable --
# confirm this is intended.
imputer_columns = ['Original Price', 'Discounted Price', 'Rating']
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(df[imputer_columns])
# Only the Rating column is written back; the price columns are left as they were.
df['Rating'] = imputed_values[:, imputer_columns.index('Rating')]

In [16]:
# Preview the first rows of df.
df.head()

Unnamed: 0,Title,Author,Original Price,Discounted Price,Rating
0,দেয়াল,হুমায়ূন আহমেদ,450,315,431.0
1,শম্পা কী চায়?,এশরার লতিফ,350,301,11.0
2,ইনসমনিয়া,ফরহাদ হোসেন,320,275,3.0
3,সাতপুরুষে খেদু মিয়া,অসীম হিমেল,580,499,5.0
4,হুমায়ূন আহমেদ রচনাবলী ২,হুমায়ূন আহমেদ,800,688,11.0


In [17]:
# Feature Engineering
df['Discount_Amount'] = df['Original Price'] - df['Discounted Price']
# Guard the division: an Original Price of 0 would otherwise produce +/-inf.
# Mapping the zero denominator to NaN gives a NaN percentage that can be
# detected and handled like any other missing value.
nonzero_original_price = df['Original Price'].replace(0, np.nan)
df['Discount_Percentage'] = (df['Discount_Amount'] / nonzero_original_price) * 100
# Vectorised string length; unlike apply(len) it yields NaN instead of raising
# when a title is missing or not a string.
df['Title_Length'] = df['Title'].str.len()

In [18]:
# Preview the first rows of df, now including the engineered columns.
df.head()

Unnamed: 0,Title,Author,Original Price,Discounted Price,Rating,Discount_Amount,Discount_Percentage,Title_Length
0,দেয়াল,হুমায়ূন আহমেদ,450,315,431.0,135,30.0,5
1,শম্পা কী চায়?,এশরার লতিফ,350,301,11.0,49,14.0,13
2,ইনসমনিয়া,ফরহাদ হোসেন,320,275,3.0,45,14.0625,8
3,সাতপুরুষে খেদু মিয়া,অসীম হিমেল,580,499,5.0,81,13.965517,19
4,হুমায়ূন আহমেদ রচনাবলী ২,হুমায়ূন আহমেদ,800,688,11.0,112,14.0,23


In [19]:
# Feature 6: Discount Category
# Bucket the discount percentage into five ordered bands.
# pd.cut uses right-closed intervals, so with the previous edges [0, ..., 50]
# a 0% discount (not inside (0, 10]) and anything above 50% both became NaN.
# include_lowest=True puts 0% into 'Very Low', and extending the last edge to
# 100 puts every discount above 40% into 'Very High'. Values in (0, 50] are
# assigned exactly as before.
bins = [0, 10, 20, 30, 40, 100]
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
df['Discount_Category'] = pd.cut(
    df['Discount_Percentage'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

In [20]:
# Preview the first rows of df, now including Discount_Category.
df.head()

Unnamed: 0,Title,Author,Original Price,Discounted Price,Rating,Discount_Amount,Discount_Percentage,Title_Length,Discount_Category
0,দেয়াল,হুমায়ূন আহমেদ,450,315,431.0,135,30.0,5,Medium
1,শম্পা কী চায়?,এশরার লতিফ,350,301,11.0,49,14.0,13,Low
2,ইনসমনিয়া,ফরহাদ হোসেন,320,275,3.0,45,14.0625,8,Low
3,সাতপুরুষে খেদু মিয়া,অসীম হিমেল,580,499,5.0,81,13.965517,19,Low
4,হুমায়ূন আহমেদ রচনাবলী ২,হুমায়ূন আহমেদ,800,688,11.0,112,14.0,23,Low


In [21]:
# Count missing values per column.
df.isnull().sum()

Title                  0
Author                 0
Original Price         0
Discounted Price       0
Rating                 0
Discount_Amount        0
Discount_Percentage    0
Title_Length           0
Discount_Category      0
dtype: int64

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [23]:
# Encode categorical features
# LabelEncoder numbers classes in sorted (alphabetical) order:
# 'High' < 'Low' < 'Medium' < 'Very High' < 'Very Low'. That scrambles the
# natural order of the discount bands. The categorical built by pd.cut already
# carries the bands in order, so its integer codes (0 = first band, increasing
# with discount, -1 for missing) preserve the ranking.
label_encoder = LabelEncoder()  # still created so code reusing this name keeps working
discount_category = df['Discount_Category']
if isinstance(discount_category.dtype, pd.CategoricalDtype):
    df['Discount_Category'] = discount_category.cat.codes.astype('int64')
elif not pd.api.types.is_numeric_dtype(discount_category):
    # Plain strings (no category order available): fall back to label encoding.
    df['Discount_Category'] = label_encoder.fit_transform(discount_category)
# Already numeric: the column was encoded by an earlier run of this cell; leave it.

In [24]:
# Inspect the column labels after feature engineering and encoding.
df.columns

Index(['Title', 'Author', 'Original Price', 'Discounted Price', 'Rating',
       'Discount_Amount', 'Discount_Percentage', 'Title_Length',
       'Discount_Category'],
      dtype='object')