In [48]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

In [49]:
# Reading the dataset with Pandas
df = pd.read_csv("./Sales Transaction v.4a.csv")
df.head()

Unnamed: 0,TransactionNo,Date,ProductNo,ProductName,Price,Quantity,CustomerNo,Country
0,581482,12/9/2019,22485,Set Of 2 Wooden Market Crates,21.47,12,17490.0,United Kingdom
1,581475,12/9/2019,22596,Christmas Star Wish List Chalkboard,10.65,36,13069.0,United Kingdom
2,581475,12/9/2019,23235,Storage Tin Vintage Leaf,11.53,12,13069.0,United Kingdom
3,581475,12/9/2019,23272,Tree T-Light Holder Willie Winkie,10.65,12,13069.0,United Kingdom
4,581475,12/9/2019,23239,Set Of 4 Knick Knack Tins Poppies,11.94,6,13069.0,United Kingdom


Data Cleaning

In [50]:
# Menghapus data duplikat berdasarkan semua kolom
df_no_duplicates = df.drop_duplicates()

# Menghapus nilai yang hilang (missing values) dari DataFrame
df_cleaned = df_no_duplicates.dropna()

# Menghilangkan baris dengan nilai Quantity negatif
df_cleaned = df_cleaned.loc[df_cleaned['Quantity'] >= 0]
df_cleaned = df_cleaned.loc[df_cleaned['Quantity'] <= 250]


df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'], format='%m/%d/%Y')

In [51]:
df_cleaned

Unnamed: 0,TransactionNo,Date,ProductNo,ProductName,Price,Quantity,CustomerNo,Country
0,581482,2019-12-09,22485,Set Of 2 Wooden Market Crates,21.47,12,17490.0,United Kingdom
1,581475,2019-12-09,22596,Christmas Star Wish List Chalkboard,10.65,36,13069.0,United Kingdom
2,581475,2019-12-09,23235,Storage Tin Vintage Leaf,11.53,12,13069.0,United Kingdom
3,581475,2019-12-09,23272,Tree T-Light Holder Willie Winkie,10.65,12,13069.0,United Kingdom
4,581475,2019-12-09,23239,Set Of 4 Knick Knack Tins Poppies,11.94,6,13069.0,United Kingdom
...,...,...,...,...,...,...,...,...
536320,536585,2018-12-01,37449,Ceramic Cake Stand + Hanging Cakes,20.45,2,17460.0,United Kingdom
536321,536590,2018-12-01,22776,Sweetheart 3 Tier Cake Stand,20.45,1,13065.0,United Kingdom
536322,536590,2018-12-01,22622,Box Of Vintage Alphabet Blocks,20.45,2,13065.0,United Kingdom
536323,536591,2018-12-01,37449,Ceramic Cake Stand + Hanging Cakes,20.45,1,14606.0,United Kingdom


In [52]:
df_sorted = df_cleaned.sort_values(by='Quantity', ascending=False)
df_sorted

Unnamed: 0,TransactionNo,Date,ProductNo,ProductName,Price,Quantity,CustomerNo,Country
35750,579193,2019-11-28,22959,Wrap Christmas Village,6.19,250,12657.0,France
380452,550272,2019-04-15,22419,Lipstick Pen Red,10.62,250,18079.0,United Kingdom
398302,548329,2019-03-30,82581,Toilet Metal Sign,10.68,250,16584.0,United Kingdom
487840,540408,2019-01-07,85123A,Cream Hanging Heart T-Light Holder,12.86,250,16013.0,United Kingdom
70329,576374,2019-11-15,22646,Ceramic Strawberry Cake Money Bank,11.53,250,13777.0,United Kingdom
...,...,...,...,...,...,...,...,...
284134,559305,2019-07-07,21025,Space Frog,11.53,1,17892.0,United Kingdom
284135,559305,2019-07-07,21026,Space Owl,11.53,1,17892.0,United Kingdom
284136,559305,2019-07-07,22654,Deluxe Sewing Kit,16.35,1,17892.0,United Kingdom
284138,559305,2019-07-07,20733,Gold Mini Tape Measure,11.12,1,17892.0,United Kingdom


In [53]:
unique_countries = df_sorted['Country'].unique()
print("Nilai-nilai unik dalam kolom 'Country':")
print(unique_countries)

Nilai-nilai unik dalam kolom 'Country':
['France' 'United Kingdom' 'Sweden' 'Netherlands' 'Australia' 'EIRE'
 'Austria' 'Iceland' 'Denmark' 'Norway' 'Channel Islands' 'Germany'
 'Italy' 'Spain' 'Japan' 'Belgium' 'Finland' 'Switzerland' 'Portugal'
 'Singapore' 'Israel' 'Cyprus' 'Hong Kong' 'Bahrain' 'USA'
 'Czech Republic' 'United Arab Emirates' 'Canada' 'Unspecified' 'Greece'
 'Lithuania' 'Poland' 'Malta' 'European Community' 'Lebanon' 'Brazil'
 'RSA' 'Saudi Arabia']


In [83]:
# Filter data untuk negara Germany
df_country = df_sorted[df_sorted['Country'] == 'United Kingdom']

# Mengelompokkan berdasarkan nama produk dan menghitung total penjualan
product_sales = df_country.groupby('ProductName')['Quantity'].sum().reset_index()

# Mengurutkan produk berdasarkan jumlah terjual secara menurun dan mengambil 10 produk teratas
top_products = product_sales.sort_values(by='Quantity', ascending=False).head(5)

# Membuat diagram batang menggunakan plotly express untuk 10 produk teratas yang terjual di negara Germany
fig = px.bar(top_products, x='ProductName', y='Quantity',
             labels={'Quantity': 'Jumlah Terjual'},
             title='10 Produk Teratas yang Terjual di Jerman',
             height=500)

fig.update_xaxes(tickangle=45)  # Rotasi label sumbu x agar mudah dibaca
fig.show()

In [55]:
df_sorted

Unnamed: 0,TransactionNo,Date,ProductNo,ProductName,Price,Quantity,CustomerNo,Country
35750,579193,2019-11-28,22959,Wrap Christmas Village,6.19,250,12657.0,France
380452,550272,2019-04-15,22419,Lipstick Pen Red,10.62,250,18079.0,United Kingdom
398302,548329,2019-03-30,82581,Toilet Metal Sign,10.68,250,16584.0,United Kingdom
487840,540408,2019-01-07,85123A,Cream Hanging Heart T-Light Holder,12.86,250,16013.0,United Kingdom
70329,576374,2019-11-15,22646,Ceramic Strawberry Cake Money Bank,11.53,250,13777.0,United Kingdom
...,...,...,...,...,...,...,...,...
284134,559305,2019-07-07,21025,Space Frog,11.53,1,17892.0,United Kingdom
284135,559305,2019-07-07,21026,Space Owl,11.53,1,17892.0,United Kingdom
284136,559305,2019-07-07,22654,Deluxe Sewing Kit,16.35,1,17892.0,United Kingdom
284138,559305,2019-07-07,20733,Gold Mini Tape Measure,11.12,1,17892.0,United Kingdom


In [104]:
top_product = top_products['ProductName'].unique()
df_filtered = df_sorted[(df_sorted['ProductName'].isin(top_product)) & (df_sorted['Country'] == 'United Kingdom')] 
df_sum_quantity = df_filtered.groupby(['ProductName', 'Date', 'Price'])['Quantity'].sum().reset_index()
df_sum_quantity

# Membuat diagram batang menggunakan plotly express untuk produk teratas yang terjual di Jerman
fig = px.line(df_sum_quantity, x='Date', y='Quantity', color='ProductName',
             labels={'Quantity': 'Jumlah Terjual'},
             title='Produk Teratas yang Terjual di United Kingdom',
             height=500)

fig.update_xaxes(tickangle=45)  # Rotasi label sumbu x agar mudah dibaca
fig.show()

In [57]:
# Import library yang dibutuhkan
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [87]:
# Menghilangkan baris dengan nilai Quantity negatif
df_sum_quantity = df_sum_quantity.loc[df_sum_quantity['Quantity'] >= 0]
df_sum_quantity = df_sum_quantity.loc[df_sum_quantity['Quantity'] <= 250]

In [100]:
# Membuat data contoh
np.random.seed(0)

df_model = df_sum_quantity[df_sum_quantity['ProductName'] == 'World War 2 Gliders Asstd Designs']
# Mengonversi kolom 'Date' ke dalam format datetime
df_model['Date'] = pd.to_datetime(df_model['Date'], format='%m/%d/%Y')

# Menghitung jumlah hari sejak tanggal pertama dalam dataset
start_date = df_model['Date'].min()
df_model['Days'] = (df_model['Date'] - start_date).dt.days
df_model



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,ProductName,Date,Price,Quantity,Days
2209,World War 2 Gliders Asstd Designs,2018-12-02,10.55,96,0
2210,World War 2 Gliders Asstd Designs,2018-12-03,10.55,48,1
2211,World War 2 Gliders Asstd Designs,2018-12-05,10.55,96,3
2212,World War 2 Gliders Asstd Designs,2018-12-06,10.69,8,4
2213,World War 2 Gliders Asstd Designs,2018-12-07,10.55,48,5
...,...,...,...,...,...
2464,World War 2 Gliders Asstd Designs,2019-12-07,6.19,48,370
2465,World War 2 Gliders Asstd Designs,2019-12-07,7.24,144,370
2466,World War 2 Gliders Asstd Designs,2019-12-08,6.19,48,371
2467,World War 2 Gliders Asstd Designs,2019-12-09,6.19,1,372


In [101]:
# Mengambil fitur (X) dan target (y) dari DataFrame
X = df_model['Days'].values.reshape(-1, 1)
y = df_model['Quantity'].values.reshape(-1, 1)

# Membuat model Linear Regression
model = LinearRegression()

# Melatih model dengan seluruh data
model.fit(X, y)

# Membuat prediksi menggunakan seluruh data
y_pred = model.predict(X)

# Membuat DataFrame untuk hasil prediksi
df_pred = pd.DataFrame({
    'Days': df_model['Days'],
    'Quantity': y_pred.flatten()
})

# Menampilkan plot hasil prediksi
fig = px.scatter(df_model, x='Days', y='Quantity', title='Linear Regression with Plotly Express')
fig.add_trace(px.line(df_pred, x='Days', y='Quantity').data[0])  # Menambahkan garis regresi
fig.show()

In [102]:
# Mengambil fitur untuk prediksi (hari ke-347 sampai ke-400)
new_days = np.arange(347, 401).reshape(-1, 1)

# Membuat prediksi untuk data baru menggunakan model yang sudah dilatih
predicted_quantities_new = model.predict(new_days)

# Membuat DataFrame untuk hasil prediksi tambahan
df_pred_new = pd.DataFrame({
    'Days': new_days.flatten(),
    'Quantity': predicted_quantities_new.flatten()
})

# Menampilkan plot hasil prediksi tambahan
fig = px.scatter(df_model, x='Days', y='Quantity', title='Linear Regression with Plotly Express')

# Menambahkan garis regresi untuk data latihan (warna biru)
fig.add_trace(px.line(df_pred, x='Days', y='Quantity', line_shape='linear', ).data[0])

# Menambahkan garis regresi untuk prediksi tambahan (warna merah)
fig.add_trace(px.line(df_pred_new, x='Days', y='Quantity', line_shape='linear', ).data[0])

# Menampilkan plot
fig.show()

In [99]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Menghitung nilai MSE untuk data latihan
mse = mean_squared_error(y, y_pred)

# Menghitung nilai RMSE (Root Mean Squared Error)
rmse = np.sqrt(mse)

# Menghitung nilai MAPE (Mean Absolute Percentage Error)
mape = np.mean(np.abs((y - y_pred) / y)) * 100

print("Mean Squared Error (MSE) for the training data:", mse)
print("Root Mean Squared Error (RMSE) for the training data:", rmse)


Mean Squared Error (MSE) for the training data: 2571.6720854272276
Root Mean Squared Error (RMSE) for the training data: 50.711656307275426
