### Part I
# **Sales Prediction**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
# loading data
sales_data = pd.read_csv("train.csv")
store_data = pd.read_csv("store.csv")

In [3]:
sales_data.head()

Unnamed: 0,Store_id,DayOfWeek,Date,Sales,NumberOfCustomers,Is_Open,BOGO,Holiday
0,1,5,7/31/2015,5263,555,1,1,1
1,2,5,7/31/2015,6064,625,1,1,1
2,3,5,7/31/2015,8314,821,1,1,1
3,4,5,7/31/2015,13995,1498,1,1,1
4,5,5,7/31/2015,4822,559,1,1,1


In [4]:
merged_data = sales_data.merge(store_data, on="Store_id", how="left")

In [5]:
merged_data.shape

(1017209, 17)

In [6]:
merged_data.head()

Unnamed: 0,Store_id,DayOfWeek,Date,Sales,NumberOfCustomers,Is_Open,BOGO,Holiday,RetailType,Stock variety,DistanceToRivalStore,RivalOpeningMonth,RivalEntryYear,ContinuousBogo,ContinuousBogoSinceWeek,ContinuousBogoSinceYear,ContinuousBogoMonths
0,1,5,7/31/2015,5263,555,1,1,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,5,7/31/2015,6064,625,1,1,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,5,7/31/2015,8314,821,1,1,1,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,5,7/31/2015,13995,1498,1,1,1,c,c,620.0,9.0,2009.0,0,,,
4,5,5,7/31/2015,4822,559,1,1,1,a,a,29910.0,4.0,2015.0,0,,,


In [7]:
merged_data = merged_data[pd.to_datetime(merged_data['Date'], errors='coerce').notna()]

In [8]:
# transformin Date column to time format
merged_data['Date'] = pd.to_datetime(merged_data['Date'])

In [9]:
# extracting year, month, day, week
merged_data['Year'] = merged_data['Date'].dt.year
merged_data['Month'] = merged_data['Date'].dt.month
merged_data['Day'] = merged_data['Date'].dt.day
merged_data['WeekOfYear'] = merged_data['Date'].dt.isocalendar().week

In [10]:
# deleting the date column since we extracted what we needed
merged_data.drop(columns=['Date'], inplace=True)

In [11]:
merged_data.head()

Unnamed: 0,Store_id,DayOfWeek,Sales,NumberOfCustomers,Is_Open,BOGO,Holiday,RetailType,Stock variety,DistanceToRivalStore,RivalOpeningMonth,RivalEntryYear,ContinuousBogo,ContinuousBogoSinceWeek,ContinuousBogoSinceYear,ContinuousBogoMonths,Year,Month,Day,WeekOfYear
0,1,5,5263,555,1,1,1,c,a,1270.0,9.0,2008.0,0,,,,2015,7,31,31
1,2,5,6064,625,1,1,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,31,31
2,3,5,8314,821,1,1,1,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",2015,7,31,31
3,4,5,13995,1498,1,1,1,c,c,620.0,9.0,2009.0,0,,,,2015,7,31,31
4,5,5,4822,559,1,1,1,a,a,29910.0,4.0,2015.0,0,,,,2015,7,31,31


In [12]:
merged_data['ContinuousBogoMonths'] = pd.to_numeric(merged_data['ContinuousBogoMonths'], errors='coerce')

# one-hot encoding the alphabet columns
merged_data = pd.get_dummies(merged_data, columns=['RetailType', 'Stock variety'], drop_first=True)

# replacing missing data in DistanceToRivalStore with median
merged_data['DistanceToRivalStore'] = merged_data['DistanceToRivalStore'].fillna(merged_data['DistanceToRivalStore'].median())

# replacing the rest with 0
merged_data = merged_data.fillna(0)

In [13]:
# is not accessible at training
merged_data.drop(columns=['NumberOfCustomers'], inplace=True)

In [14]:
# Sales is our target
X = merged_data.drop(columns=['Sales'])
y = merged_data['Sales']

In [15]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [16]:
X.dtypes

Store_id                     int64
DayOfWeek                    int64
Is_Open                      int64
BOGO                         int64
Holiday                      int64
DistanceToRivalStore       float64
RivalOpeningMonth          float64
RivalEntryYear             float64
ContinuousBogo               int64
ContinuousBogoSinceWeek    float64
ContinuousBogoSinceYear    float64
ContinuousBogoMonths       float64
Year                         int32
Month                        int32
Day                          int32
WeekOfYear                  UInt32
RetailType_b                  bool
RetailType_c                  bool
RetailType_d                  bool
Stock variety_b               bool
Stock variety_c               bool
dtype: object

In [17]:
# sort
merged_data = merged_data.sort_values(by="Year")

split_index = int(0.7 * len(merged_data))

# train and test split
X_train, X_test = X_scaled[:split_index], X_scaled[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)

# random forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_model(y_test, y_pred, model_name):
    print(f"model {model_name}:")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
    print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")
    print(f"R^2 Score: {r2_score(y_test, y_pred):.2f}")
    print("-" * 30)

# elavuating models
evaluate_model(y_test, y_pred_lr, "Linear Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")

model Linear Regression:
MAE: 1765.43
MSE: 6070629.12
R^2 Score: 0.55
------------------------------
model Random Forest:
MAE: 605.65
MSE: 944018.79
R^2 Score: 0.93
------------------------------


In [19]:
feature_importance = rf_model.feature_importances_

importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

In [20]:
# important features in Random Forest Model
importance_df

Unnamed: 0,Feature,Importance
2,Is_Open,0.46467
5,DistanceToRivalStore,0.107365
0,Store_id,0.085494
3,BOGO,0.072513
7,RivalEntryYear,0.042058
1,DayOfWeek,0.037509
6,RivalOpeningMonth,0.035347
14,Day,0.026125
15,WeekOfYear,0.024144
10,ContinuousBogoSinceYear,0.019485


### Part II
# **Sentiment Analysis**

In [21]:
# pip install nltk

In [22]:
import re
import nltk  #natural language toolkit
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [23]:
# loading data
df = pd.read_csv("train_sentiment.csv")

In [24]:
df.head()

Unnamed: 0.1,Unnamed: 0,review,rating
0,0,Looks durable Charging is fine tooNo complains...,4.2
1,1,I ordered this cable to connect my phone to An...,4.0
2,2,"Not quite durable and sturdy,https://m.media-a...",3.9
3,3,"Good product,long wire,Charges good,Nice,I bou...",4.2
4,4,"Bought this instead of original apple, does th...",4.2


In [25]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.strip()
    return text

df['Cleaned_Review'] = df['review'].apply(clean_text)

In [26]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\msi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
stopwords.words('english')[:10]

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

In [28]:
df['Cleaned_Review'].head()

0    looks durable charging is fine toono complains...
1    i ordered this cable to connect my phone to an...
2    not quite durable and sturdyhttpsmmediaamazonc...
3    good productlong wirecharges goodnicei bought ...
4    bought this instead of original apple does the...
Name: Cleaned_Review, dtype: object

In [29]:
X_tfidf = df.drop(columns=['rating'])
y = df['rating']

In [30]:
y.head()

0    4.2
1    4.0
2    3.9
3    4.2
4    4.2
Name: rating, dtype: object

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# transforming text to numerical vectors with TF-IDF
vectorizer_tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer_tfidf.fit_transform(df['Cleaned_Review']).toarray()

In [32]:
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

In [33]:
!pip install gensim



In [34]:
import gensim

In [35]:
from gensim.models import Word2Vec

In [36]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\msi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [37]:
# turning the text into a list
sentences = [word_tokenize(text) for text in df['Cleaned_Review']]

# training Word2Vec model
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# extracting the vector of each sentence
def get_sentence_vector(sentence, model):
    words = word_tokenize(sentence)
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

# turning each sentence into a numerical vector
X_w2v = np.array([get_sentence_vector(text, w2v_model) for text in df['Cleaned_Review']])

In [38]:
X_train_w2v, X_test_w2v, _, _ = train_test_split(X_w2v, y, test_size=0.3, random_state=42)

In [39]:
# pip install sentence-transformers

In [40]:
import sentence_transformers

  from .autonotebook import tqdm as notebook_tqdm


In [41]:
from sentence_transformers import SentenceTransformer

In [42]:
bert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

X_bert = bert_model.encode(df['Cleaned_Review'])

X_train_bert, X_test_bert, _, _ = train_test_split(X_bert, y, test_size=0.3, random_state=42)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

# setting hyperparameters for each model
def hyperparameter_tuning(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [45]:
# Logistic Regression
lr = LogisticRegression(max_iter=1000)
param_grid_lr = {'C': [0.1, 1, 10, 100]}
best_lr_tfidf = hyperparameter_tuning(lr, param_grid_lr, X_train_tfidf, y_train)
best_lr_w2v = hyperparameter_tuning(lr, param_grid_lr, X_train_w2v, y_train)
best_lr_bert = hyperparameter_tuning(lr, param_grid_lr, X_train_bert, y_train)



In [None]:
# Random Forest
rf = RandomForestClassifier(random_state=42)
param_grid_rf = {'n_estimators': [100, 200, 500], 'max_depth': [None, 10, 20, 30]}
best_rf_tfidf = hyperparameter_tuning(rf, param_grid_rf, X_train_tfidf, y_train)
best_rf_w2v = hyperparameter_tuning(rf, param_grid_rf, X_train_w2v, y_train)
best_rf_bert = hyperparameter_tuning(rf, param_grid_rf, X_train_bert, y_train)



In [None]:
# K-Nearest Neighbors
knn = KNeighborsClassifier()
param_grid_knn = {'n_neighbors': [3, 5, 7, 9]}
best_knn_tfidf = hyperparameter_tuning(knn, param_grid_knn, X_train_tfidf, y_train)
best_knn_w2v = hyperparameter_tuning(knn, param_grid_knn, X_train_w2v, y_train)
best_knn_bert = hyperparameter_tuning(knn, param_grid_knn, X_train_bert, y_train)

In [None]:
# ارزیابی مدل‌ها
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred, average='weighted')

# چاپ نتایج
print(f'Logistic Regression TF-IDF F1 Score: {evaluate_model(best_lr_tfidf, X_test_tfidf, y_test)}')
print(f'Logistic Regression Word2Vec F1 Score: {evaluate_model(best_lr_w2v, X_test_w2v, y_test)}')
print(f'Logistic Regression BERT F1 Score: {evaluate_model(best_lr_bert, X_test_bert, y_test)}')

print(f'Random Forest TF-IDF F1 Score: {evaluate_model(best_rf_tfidf, X_test_tfidf, y_test)}')
print(f'Random Forest Word2Vec F1 Score: {evaluate_model(best_rf_w2v, X_test_w2v, y_test)}')
print(f'Random Forest BERT F1 Score: {evaluate_model(best_rf_bert, X_test_bert, y_test)}')

print(f'K-Nearest Neighbors TF-IDF F1 Score: {evaluate_model(best_knn_tfidf, X_test_tfidf, y_test)}')
print(f'K-Nearest Neighbors Word2Vec F1 Score: {evaluate_model(best_knn_w2v, X_test_w2v, y_test)}')
print(f'K-Nearest Neighbors BERT F1 Score: {evaluate_model(best_knn_bert, X_test_bert, y_test)}')