In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sales-forecasting/sample_submission.csv
/kaggle/input/sales-forecasting/train.csv
/kaggle/input/sales-forecasting/test.csv
/kaggle/input/sales-forecasting/EconomicIndicators.csv


In [2]:
# Read the four competition CSV files into DataFrames
train_df = pd.read_csv('/kaggle/input/sales-forecasting/train.csv')
test_df = pd.read_csv('/kaggle/input/sales-forecasting/test.csv')
economic_indicators_df = pd.read_csv('/kaggle/input/sales-forecasting/EconomicIndicators.csv')
sample_submission_df = pd.read_csv('/kaggle/input/sales-forecasting/sample_submission.csv')

# Preview the first rows of each table under its own heading
previews = (
    ("Train Data:", train_df),
    ("\nTest Data:", test_df),
    ("\nEconomic Indicators Data:", economic_indicators_df),
    ("\nSample Submission:", sample_submission_df),
)
for heading, frame in previews:
    print(heading, frame.head(), sep="\n")

# Report the number of missing values in every column of the modelling tables
null_reports = (
    ("\nMissing Values in Train Data:", train_df),
    ("\nMissing Values in Test Data:", test_df),
    ("\nMissing Values in Economic Indicators Data:", economic_indicators_df),
)
for heading, frame in null_reports:
    print(heading, frame.isnull().sum(), sep="\n")

Train Data:
   ID Company Quarter  QuickRatio  InventoryRatio  RevenueGrowth  \
0   0   CMP01      Q1        2.02            7.71           0.05   
1   1   CMP01      Q2        2.01            4.10           0.03   
2   2   CMP01      Q3        2.02            6.79           0.06   
3   3   CMP01      Q4        1.98            3.97           0.01   
4   4   CMP01      Q5        1.96            7.41          -0.07   

   MarketshareChange Bond rating Stock rating Region           Industry  \
0              -0.04         CCC          Buy  South  Metal Fabrication   
1               0.00         CCC         Hold  South  Metal Fabrication   
2              -0.02         CCC          Buy  South  Metal Fabrication   
3               0.02         CCC          Buy  South  Metal Fabrication   
4               0.02         CCC          Buy  South  Metal Fabrication   

    Sales  
0  1517.0  
1  2968.0  
2  1497.0  
3  2929.0  
4  1452.0  

Test Data:
   ID Company Quarter  QuickRatio  Inventory

In [3]:
# Impute missing InventoryRatio values with the median of the training data.
# The same training median is applied to the test set so that both frames are
# preprocessed identically and no statistic is derived from the test data.
inventory_ratio_median = train_df['InventoryRatio'].median()
train_df['InventoryRatio'] = train_df['InventoryRatio'].fillna(inventory_ratio_median)
test_df['InventoryRatio'] = test_df['InventoryRatio'].fillna(inventory_ratio_median)

# Remove training rows that have no Sales target. Reassignment is used instead
# of inplace=True so the statement produces a new frame rather than mutating one.
train_df = train_df.dropna(subset=['Sales'])

# Convert Quarter into numerical values
def quarter_to_month(quarter):
    """Convert a quarter label such as ``'Q3'`` to the first month of that quarter.

    The quarter number is read from the run of digits at the end of the label,
    so multi-digit quarters (``'Q10'``, ``'Q12'``) are handled as well as
    single-digit ones. Quarter ``n`` maps to month ``(n - 1) * 3 + 1``.

    Parameters
    ----------
    quarter : str
        Quarter label ending in the quarter number, e.g. ``'Q1'``.

    Returns
    -------
    int
        Month index of the first month of the quarter (``'Q1'`` -> 1,
        ``'Q2'`` -> 4, ``'Q10'`` -> 28).

    Raises
    ------
    ValueError
        If the label does not end in a number.
    """
    label = quarter.strip()
    # Everything rstrip() removes is the trailing digit run; reading only the
    # last character would turn 'Q10' into quarter 0.
    digits = label[len(label.rstrip('0123456789')):]
    if not digits:
        raise ValueError(f"Quarter label has no trailing number: {quarter!r}")
    return (int(digits) - 1) * 3 + 1

# Derive the quarter-start month for every row of both frames
train_df['Month'] = train_df['Quarter'].map(quarter_to_month)
test_df['Month'] = test_df['Quarter'].map(quarter_to_month)

# Attach the economic indicators by Month. merge() defaults to an inner join,
# so only rows whose Month also appears in the indicators table are kept.
train_df = train_df.merge(economic_indicators_df, on='Month')
test_df = test_df.merge(economic_indicators_df, on='Month')


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from pandas.api.types import is_string_dtype, is_numeric_dtype

# A simple function to preprocess the data
def preprocess_data(df):
    """Return a one-hot encoded copy of ``df``.

    Every column that pandas reports as a string dtype is converted to an
    ordered categorical and then expanded by ``pd.get_dummies`` into indicator
    columns, including an extra ``<column>_nan`` indicator for missing values
    (``dummy_na=True``). Other columns are passed through unchanged.

    The conversion is done on a copy, so the DataFrame passed in by the caller
    is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to encode.

    Returns
    -------
    pandas.DataFrame
        The encoded feature table.
    """
    prepared = df.copy()
    for name, column in df.items():
        if is_string_dtype(column):
            prepared[name] = column.astype('category').cat.as_ordered()
    return pd.get_dummies(prepared, dummy_na=True)

# Build the model inputs: the target and the ID/Month columns are removed
# before the remaining features are encoded.
y_train = train_df['Sales']
X_train = preprocess_data(train_df.drop(columns=['Sales', 'ID', 'Month']))
X_test = preprocess_data(test_df.drop(columns=['ID', 'Month']))

# Give both feature matrices the same columns; a column that exists on only
# one side is created on the other and filled with 0.
X_train, X_test = X_train.align(X_test, join='outer', axis=1, fill_value=0)

# Hold out 20% of the training rows for validation (fixed seed for repeatability)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_split, y_train_split
)

# Score the fitted model on the rows it was trained on and on the held-out rows
y_pred_train = model.predict(X_train_split)
y_pred_val = model.predict(X_val_split)
mae_train = mean_absolute_error(y_train_split, y_pred_train)
mae_val = mean_absolute_error(y_val_split, y_pred_val)
print(f'Mean Absolute Error on Training Set: {mae_train}')
print(f'Mean Absolute Error on Validation Set: {mae_val}')

# Predict the test set and write the ID/Sales pairs to the submission file
predictions = model.predict(X_test)
submission_df = test_df[['ID']].assign(Sales=predictions)
submission_df.to_csv('/kaggle/working/submission.csv', index=False)



Mean Absolute Error on Training Set: 342.3149761904762
Mean Absolute Error on Validation Set: 1022.2146666666666


In [5]:
# Round the predicted 'Sales' values to the nearest whole number
submission_df['Sales'] = submission_df['Sales'].round(0).astype(int)

# Persist the rounded predictions: the submission file written earlier in the
# notebook was saved before rounding and would otherwise keep the raw floats.
submission_df.to_csv('/kaggle/working/submission.csv', index=False)
print(submission_df.head())

   ID  Sales
0   7   2699
1   8   2466
2  16   4212
3  17   3893
4  25   4574
