# Understanding and Cleaning the data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the raw sales export (presumably a mounted Google Drive in Colab;
# adjust this path when running elsewhere).
raw_sales_csv = '/content/drive/MyDrive/term3/my project/raw data/View_Sales (PSP)_Migrated Data_raw data.csv'

# Read the raw export into a DataFrame; later cells work on a copy of it.
sales = pd.read_csv(raw_sales_csv)

In [None]:
# Preview the first rows of the raw data to check that the file loaded as expected.
sales.head()

In [None]:
# Work on a copy so the raw `sales` frame stays untouched by the cleaning steps.
sales_copy = sales.copy()

In [None]:
# Normalise the column labels: every space becomes an underscore and all
# letters are lower-cased, so columns are easy to reference by name.
sales_copy.columns = (
    sales_copy.columns
    .str.replace(' ', '_', regex=False)
    .str.lower()
)

In [None]:
# Summarise column names, non-null counts and dtypes after the rename.
sales_copy.info()

In [None]:
# Parse the 'date' column into pandas datetime values, then preview the result.
sales_copy['date'] = pd.to_datetime(sales_copy['date'])
sales_copy['date'].head()

In [None]:
# Columns removed before analysis. 'blank', 'path', 'zero' and 'densificator'
# carry no meaning; the remaining ones are not used anywhere later in this notebook.
unused_columns = [
    'blank',
    'path',
    'zero',
    'densificator',
    'number_of_records',
    'sales_in_seleted_currency',
    'business_unit_1',
    'business_unit_2',
    'business_unit_3',
    'business_unit_4',
    'business_unit_5',
]
sales_copy = sales_copy.drop(columns=unused_columns)

In [None]:
# Show the columns that remain after dropping the unused ones.
sales_copy.columns

In [None]:
# Treat missing 'incomes' as zero: NaN values are filled with 0 first, then any
# remaining empty strings are replaced with 0 as well.
sales_copy['incomes'] = (
    sales_copy['incomes']
    .fillna(0)
    .replace('', 0)
)

In [None]:
# Treat missing 'returns' as zero: NaN values are filled with 0 first, then any
# remaining empty strings are replaced with 0 as well.
sales_copy['returns'] = (
    sales_copy['returns']
    .fillna(0)
    .replace('', 0)
)

# Data Pre-processing

In [None]:
# Label-encode the text columns so they can take part in the correlation matrix.
# Encode an independent copy: a plain `sales_encoded = sales_copy` would only
# alias the same DataFrame, and the loop below would then overwrite the cleaned
# text values in `sales_copy` too.
sales_encoded = sales_copy.copy()

# Each object (string) column is converted to a pandas categorical and replaced
# by its integer category codes (pandas assigns -1 to missing values).
categorical_columns = sales_encoded.select_dtypes(include=['object']).columns
for col in categorical_columns:
    sales_encoded[col] = sales_encoded[col].astype('category').cat.codes

# Pairwise correlation between all columns of the encoded frame.
# NOTE(review): the codes are arbitrary integer labels, so correlations that
# involve encoded columns should be read with care.
correlation_matrix = sales_encoded.corr()

# Visualize the heatmap with encoded non-numeric data
plt.figure(figsize=(20, 20))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix (with Encoded Categorical Data)')
plt.show()


# Data Modeling

In [None]:
# Model inputs: incomes, returns, the encoded type, year and month.
# Model target: sales_m.usd.
feature_columns = ['incomes', 'returns', 'type', 'year', 'month']

X = sales_encoded[feature_columns]
y = sales_encoded['sales_m.usd']

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Random Forest Regression

In [None]:
# Fit a random forest regressor on the training split and report its
# MSE, MAE and R-squared on the held-out test split.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 100 trees with a fixed seed so the fitted forest is reproducible.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the rows the model has not seen.
y_pred = model.predict(X_test)

# Error metrics comparing the predictions with the true test targets.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")

In [None]:
# Build one input row per month of the year following the last year in the data.
# NOTE(review): incomes and returns are held at their historical mean and type
# at its most frequent code, so only month (and the new year value) varies.
n_months = 12
next_year = pd.DataFrame({
    # Keys are listed in the same column order the model was trained on.
    'incomes': [sales_encoded['incomes'].mean()] * n_months,
    'returns': [sales_encoded['returns'].mean()] * n_months,
    'type': [sales_encoded['type'].mode()[0]] * n_months,
    'year': [sales_encoded['year'].max() + 1] * n_months,
    'month': list(range(1, n_months + 1)),
})

# Predict monthly sales with the trained random forest.
projections = model.predict(next_year)

# Store the predictions next to their inputs and show the table.
next_year['sales_m.usd'] = projections
print(next_year)

In [None]:
# Line chart of the random-forest projections, one point per month.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(next_year['month'], projections, marker='o', linestyle='-')
ax.set_title('Projected Sales Trend for Next Year')
ax.set_xlabel('Month')
ax.set_ylabel('Sales (USD)')
ax.set_xticks(range(1, 13))  # one tick per month
ax.grid(True)
plt.show()

## Linear Regression

In [None]:
# Fit an ordinary linear regression on the same train/test split used above
# (features: incomes, returns, type, year, month; target: sales_m.usd) and
# report its MSE, MAE and R-squared on the test split.
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained.
linear_model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
linear_y_pred = linear_model.predict(X_test)

# Error metrics for the linear model.
linear_mse = mean_squared_error(y_test, linear_y_pred)
linear_mae = mean_absolute_error(y_test, linear_y_pred)
linear_r2 = r2_score(y_test, linear_y_pred)

print(f"Linear Regression - Mean Squared Error (MSE): {linear_mse}")
print(f"Linear Regression - Mean Absolute Error (MAE): {linear_mae}")
print(f"Linear Regression - R-squared (R2): {linear_r2}")



In [None]:
# Inputs for the twelve months of the year after the last one in the data:
# incomes/returns at their mean, type at its most frequent code.
projection_inputs = {
    'year': [sales_encoded['year'].max() + 1] * 12,
    'month': list(range(1, 13)),
    'incomes': [sales_encoded['incomes'].mean()] * 12,
    'returns': [sales_encoded['returns'].mean()] * 12,
    'type': [sales_encoded['type'].mode()[0]] * 12,
}

# Put the columns in the order the model was trained on.
training_order = ['incomes', 'returns', 'type', 'year', 'month']
next_year = pd.DataFrame(projection_inputs).reindex(columns=training_order)

# Predict monthly sales with the fitted linear regression.
linear_projections = linear_model.predict(next_year)

# Keep the predictions next to their inputs and show the table.
next_year['linear_sales_m.usd'] = linear_projections
print(next_year)

# Line chart of the linear-regression projections, one point per month.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(next_year['month'], linear_projections, marker='o', linestyle='-')
ax.set_title('Linear Regression: Projected Sales Trend for Next Year')
ax.set_xlabel('Month')
ax.set_ylabel('Sales (USD)')
ax.set_xticks(range(1, 13))  # one tick per month
ax.grid(True)
plt.show()

## K-Nearest Neighbors (KNN)

In [None]:
# Re-select the same inputs (incomes, returns, encoded type, year, month) and
# the same target (sales_m.usd) for the KNN model.
knn_feature_columns = ['incomes', 'returns', 'type', 'year', 'month']

X = sales_encoded[knn_feature_columns]
y = sales_encoded['sales_m.usd']

In [None]:

# Fit a K-nearest-neighbours regressor and report its MSE, MAE and R-squared
# on the held-out test split.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Split data into training and testing sets (same 80/20 split and seed as before)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KNN predicts from distances between rows, so features measured on larger
# numeric scales would dominate the neighbour search if left unscaled.
# Standardising inside a pipeline puts every feature on a comparable scale;
# the scaler is fitted on the training rows only and re-applied automatically
# whenever knn_model.predict() is called.
# (you can adjust the number of neighbors 'n_neighbors')
knn_model = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(n_neighbors=5),
)

# Train the scaler + KNN pipeline
knn_model.fit(X_train, y_train)

# Make predictions on the test set
knn_y_pred = knn_model.predict(X_test)

# Evaluate the KNN model
knn_mse = mean_squared_error(y_test, knn_y_pred)
knn_mae = mean_absolute_error(y_test, knn_y_pred)
knn_r2 = r2_score(y_test, knn_y_pred)

print(f"KNN - Mean Squared Error (MSE): {knn_mse}")
print(f"KNN - Mean Absolute Error (MAE): {knn_mae}")
print(f"KNN - R-squared (R2): {knn_r2}")



In [None]:
# Project sales for each month of the year after the last one in the data,
# using the fitted KNN model.

# Inputs: incomes/returns at their mean, type at its most frequent code.
knn_inputs = {
    'year': [sales_encoded['year'].max() + 1] * 12,
    'month': list(range(1, 13)),
    'incomes': [sales_encoded['incomes'].mean()] * 12,
    'returns': [sales_encoded['returns'].mean()] * 12,
    'type': [sales_encoded['type'].mode()[0]] * 12,
}

# Build the frame with the columns in the order the model was trained on.
next_year = pd.DataFrame(
    knn_inputs,
    columns=['incomes', 'returns', 'type', 'year', 'month'],
)

# Predict, keep the predictions next to their inputs, and show the table.
next_year['knn_sales_m.usd'] = knn_model.predict(next_year)
print(next_year)

# Line chart of the KNN projections, one point per month.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(next_year['month'], next_year['knn_sales_m.usd'], marker='o', linestyle='-')
ax.set_title('KNN: Projected Sales Trend for Next Year')
ax.set_xlabel('Month')
ax.set_ylabel('Sales (USD)')
ax.set_xticks(range(1, 13))  # one tick per month
ax.grid(True)
plt.show()