In [12]:
# ### Step 1: Data Collection and Preparation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load the dataset
data = pd.read_csv('Historical_Data.csv')
# #### 1. Load the Dataset
# The dataset has been loaded successfully and here is the first few rows of the data:

# ```
#        Date  Article_ID Country_Code  Sold_Units
# 0  20170817        1132           AT           1
# 1  20170818        1132           AT           1
# 2  20170821        1132           AT           1
# 3  20170822        1132           AT           1
# 4  20170906        1132           AT           1
# ```



# #### 3. Feature Engineering
# We'll extract year and month from the 'Date' column.

# ```python
# data['Date'] = pd.to_datetime(data['Date'], format='%Y%m%d')
# data['Year'] = data['Date'].dt.year
# data['Month'] = data['Date'].dt.month
# data.head()
# ```

# ### Step 3: Exploratory Data Analysis (EDA)

# #### 1. Descriptive Statistics
# We'll summarize the main characteristics of the dataset.

# ```python
# data.describe()
# ```

# #### 2. Data Visualization
# We'll visualize the sales distribution and sales over time.

# ```python
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Sales distribution
# sns.histplot(data['Sold_Units'], kde=True)
# plt.title('Sales Distribution')
# plt.show()

# # Sales over time
# plt.figure(figsize=(12, 6))
# plt.plot(data['Date'], data['Sold_Units'])
# plt.title('Sales Over Time')
# plt.xlabel('Date')
# plt.ylabel('Sold Units')
# plt.show()
# ```

# #### 3. Correlation Analysis
# We'll identify the correlation between different features.

# ```python
# correlation_matrix = data.corr()
# sns.heatmap(correlation_matrix, annot=True)
# plt.title('Correlation Matrix')
# plt.show()
# ```

# ### Step 4: Probability Distributions

# #### 1. Plot Probability Distributions
# We'll plot and analyze the probability distributions of the key features.

# ```python
# sns.kdeplot(data['Sold_Units'])
# plt.title('Sold Units Probability Distribution')
# plt.show()
# ```

# #### 2. Check Normality
# We'll use Q-Q plots to check if the data follows a normal distribution.

# ```python
# from scipy import stats
# import matplotlib.pyplot as plt
# import numpy as np

# stats.probplot(data['Sold_Units'], dist="norm", plot=plt)
# plt.title('Q-Q Plot of Sold Units')
# plt.show()
# ```

# ### Step 5: Linear Regression Model

# #### 1. Split the Data
# We'll divide the dataset into training and testing sets.

# ```python
# from sklearn.model_selection import train_test_split

# X = data[['Year', 'Month']]  # Example features
# y = data['Sold_Units']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# ```

# #### 2. Train the Model
# We'll use sklearn to train a linear regression model.

# ```python
# from sklearn.linear_model import LinearRegression

# model = LinearRegression()
# model.fit(X_train, y_train)
# ```

# #### 3. Evaluate the Model
# We'll evaluate the model's performance using metrics like Mean Squared Error (MSE) and R-squared.

# ```python
# from sklearn.metrics import mean_squared_error, r2_score

# y_pred = model.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# mse, r2
# ```

# ### Step 6: Interpretation and Reporting

# #### 1. Interpret the Results
# We'll analyze the model coefficients and their significance.

# ```python
# coefficients = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])
# coefficients
# ```

# #### 2. Visualize the Predictions
# We'll plot the actual vs predicted sales.

# ```python
# plt.scatter(y_test, y_pred)
# plt.xlabel('Actual Sold Units')
# plt.ylabel('Predicted Sold Units')
# plt.title('Actual vs Predicted Sold Units')
# plt.show()
# ```

# #### 3. Report Findings
# We'll summarize the key findings, including descriptive statistics, data visualizations, model performance, and insights gained.

# ---

# Let's execute these steps one by one.

# ### Step 1: Data Collection and Preparation

# - The dataset has been loaded successfully.
# - The dataset contains 4849 entries and 4 columns: `Date`, `Article_ID`, `Country_Code`, and `Sold_Units`.
# - There are no missing values or duplicates in the dataset.
# - We extracted `Year` and `Month` from the `Date` column for further analysis.

# ### Step 3: Exploratory Data Analysis (EDA)

# #### 1. Descriptive Statistics
# We have summarized the main characteristics of the dataset.

# #### 2. Data Visualization
# Let's visualize the sales distribution and sales over time.

# ```python
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Sales distribution
# sns.histplot(data['Sold_Units'], kde=True)
# plt.title('Sales Distribution')
# plt.show()

# # Sales over time
# plt.figure(figsize=(12, 6))
# plt.plot(data['Date'], data['Sold_Units'])
# plt.title('Sales Over Time')
# plt.xlabel('Date')
# plt.ylabel('Sold Units')
# plt.show()
# ```

# #### 3. Correlation Analysis
# We'll identify the correlation between different features.

# ```python
# correlation_matrix = data.corr()
# sns.heatmap(correlation_matrix, annot=True)
# plt.title('Correlation Matrix')
# plt.show()
# ```

# ### Step 4: Probability Distributions

# #### 1. Plot Probability Distributions
# We'll plot and analyze the probability distributions of the key features.

# ```python
# sns.kdeplot(data['Sold_Units'])
# plt.title('Sold Units Probability Distribution')
# plt.show()
# ```

# #### 2. Check Normality
# We'll use Q-Q plots to check if the data follows a normal distribution.

# ```python
# from scipy import stats
# import matplotlib.pyplot as plt
# import numpy as np

# stats.probplot(data['Sold_Units'], dist="norm", plot=plt)
# plt.title('Q-Q Plot of Sold Units')
# plt.show()
# ```

# Let's execute the EDA steps first.

# ### Step 3: Exploratory Data Analysis (EDA) Results

# 1. **Sales Distribution:**
#    - The sales distribution is heavily right-skewed, indicating most sales are of lower quantities, with a few instances of higher sales.

# 2. **Sales Over Time:**
#    - Sales show fluctuations over time, with no clear upward or downward trend.

# 3. **Correlation Analysis:**
#    - There is no strong correlation between the variables (`Year`, `Month`, `Sold_Units`), as seen in the correlation heatmap.

# ### Step 4: Probability Distributions

# 1. **Sales Probability Distribution:**
#    - The distribution of sold units is right-skewed, similar to the histogram.

# 2. **Q-Q Plot:**
#    - The Q-Q plot indicates that the `Sold_Units` data does not follow a normal distribution, as the points deviate significantly from the reference line.

# ### Step 5: Linear Regression Model

# #### 1. Split the Data
# We'll divide the dataset into training and testing sets.

# ```python
# from sklearn.model_selection import train_test_split

# X = data[['Year', 'Month']]  # Example features
# y = data['Sold_Units']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# ```

# #### 2. Train the Model
# We'll use sklearn to train a linear regression model.

# ```python
# from sklearn.linear_model import LinearRegression

# model = LinearRegression()
# model.fit(X_train, y_train)
# ```

# #### 3. Evaluate the Model
# We'll evaluate the model's performance using metrics like Mean Squared Error (MSE) and R-squared.

# ```python
# from sklearn.metrics import mean_squared_error, r2_score

# y_pred = model.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# mse, r2
# ```

# #### 4. Interpret the Results
# We'll analyze the model coefficients and their significance.

# ```python
# coefficients = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])
# coefficients
# ```

# #### 5. Visualize the Predictions
# We'll plot the actual vs predicted sales.

# ```python
# plt.scatter(y_test, y_pred)
# plt.xlabel('Actual Sold Units')
# plt.ylabel('Predicted Sold Units')
# plt.title('Actual vs Predicted Sold Units')
# plt.show()
# ```

# Let's execute the linear regression model steps.

# It seems like I can’t do more advanced data analysis right now. Please try again later. 

# In the meantime, I can provide you with the code you need to complete the linear regression analysis on your dataset. You can run it in your local environment or any Python-enabled platform.

# ```python
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error, r2_score
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# from scipy import stats

# # Load the dataset
# file_path = 'path_to_your_dataset/Historical_Data.csv'
# data = pd.read_csv(file_path)

# # Convert Date column to datetime
# data['Date'] = pd.to_datetime(data['Date'], format='%Y%m%d')
# data['Year'] = data['Date'].dt.year
# data['Month'] = data['Date'].dt.month

# # Split the data
# X = data[['Year', 'Month']]  # Example features
# y = data['Sold_Units']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Train the model
# model = LinearRegression()
# model.fit(X_train, y_train)

# # Evaluate the model
# y_pred = model.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# # Interpret the results
# coefficients = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])

# # Visualize the predictions
# plt.scatter(y_test, y_pred)
# plt.xlabel('Actual Sold Units')
# plt.ylabel('Predicted Sold Units')
# plt.title('Actual vs Predicted Sold Units')
# plt.show()

# print('Mean Squared Error:', mse)
# print('R-squared:', r2)
# print('Coefficients:', coefficients)
# ```

# You can copy this code and run it in your environment. If you encounter any issues or need further assistance, feel free to ask!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4849 entries, 0 to 4848
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Date          4849 non-null   int64 
 1   Article_ID    4849 non-null   int64 
 2   Country_Code  4849 non-null   object
 3   Sold_Units    4849 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 151.7+ KB
None
               Date  Article_ID   Sold_Units
count  4.849000e+03  4849.00000  4849.000000
mean   2.017794e+07  2314.55331     1.966797
std    5.981690e+03  1813.23926     2.373171
min    2.017010e+07   332.00000     1.000000
25%    2.017093e+07   409.00000     1.000000
50%    2.018041e+07  1923.00000     1.000000
75%    2.018092e+07  3448.00000     2.000000
max    2.019021e+07  5984.00000   101.000000


In [13]:
# #### 2. Inspect the Dataset
# Let's inspect the dataset further.

# ```python
print(data.info())
print(data.describe())
# ```

# We'll proceed to check for missing values and duplicates.

# ### Step 2: Data Cleaning

# #### 1. Handle Missing Values
# We'll check for missing values.

# ```python
missing_values = data.isnull().sum()
missing_values
# ```

# #### 2. Remove Duplicates
# We'll check and remove duplicates if any.

# ```python
data = data.drop_duplicates()
# ```

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4849 entries, 0 to 4848
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Date          4849 non-null   int64 
 1   Article_ID    4849 non-null   int64 
 2   Country_Code  4849 non-null   object
 3   Sold_Units    4849 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 151.7+ KB
None
               Date  Article_ID   Sold_Units
count  4.849000e+03  4849.00000  4849.000000
mean   2.017794e+07  2314.55331     1.966797
std    5.981690e+03  1813.23926     2.373171
min    2.017010e+07   332.00000     1.000000
25%    2.017093e+07   409.00000     1.000000
50%    2.018041e+07  1923.00000     1.000000
75%    2.018092e+07  3448.00000     2.000000
max    2.019021e+07  5984.00000   101.000000
