# Persiapan Data Penjualan

Notebook ini berisi langkah-langkah sederhana untuk mempersiapkan data penjualan berdasarkan demografi pelanggan (usia dan jenis kelamin).

In [1]:
# Import library yang diperlukan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output
%matplotlib inline

## 1. Membaca Data

In [2]:
# Load the raw sales data from the CSV file located one level above the working directory
sales_csv_path = '../sales.csv'
df = pd.read_csv(sales_csv_path)

# Preview the first rows to confirm the file was parsed as expected
df.head()

Unnamed: 0,user_id,age,gender,revenue
0,C001,24,M,19500750
1,C002,19,M,12875500
2,C003,27,F,15202500
3,C004,31,F,8592480
4,C005,22,M,10533680


## 2. Pemeriksaan Data

In [3]:
# Summarize the DataFrame: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  100 non-null    object
 1   age      100 non-null    int64 
 2   gender   100 non-null    object
 3   revenue  100 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 3.2+ KB


In [4]:
# Report how many missing (null) values each column contains
print("Jumlah data yang hilang:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

Jumlah data yang hilang:
user_id    0
age        0
gender     0
revenue    0
dtype: int64


In [5]:
# Show descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,age,revenue
count,100.0,100.0
mean,32.72,13644930.0
std,7.937992,3812007.0
min,18.0,7726680.0
25%,27.0,10613900.0
50%,31.0,13425750.0
75%,39.0,15293620.0
max,52.0,22629360.0


## 3. Transformasi Data

In [6]:
# Encode gender as a binary indicator: 'M' -> 1, 'F' -> 0.
# Any value other than 'M' or 'F' has no mapping and becomes NaN.
df['gender_code'] = df['gender'].map({'M': 1, 'F': 0})

# Bucket ages into four labelled groups.
# pd.cut builds right-closed intervals by default, so without include_lowest=True
# the first bin would be (18, 25] and customers aged exactly 18 would get NaN
# instead of '18-25'. include_lowest=True makes the first bin [18, 25].
# Ages below 18 or above 55 still fall outside every bin and become NaN.
age_bin_edges = [18, 25, 35, 45, 55]
age_bin_labels = ['18-25', '26-35', '36-45', '46-55']
df['age_group'] = pd.cut(
    df['age'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    include_lowest=True,
)

# Preview the result of the transformation
df.head()

Unnamed: 0,user_id,age,gender,revenue,gender_code,age_group
0,C001,24,M,19500750,1,18-25
1,C002,19,M,12875500,1,18-25
2,C003,27,F,15202500,0,26-35
3,C004,31,F,8592480,0,26-35
4,C005,22,M,10533680,1,18-25


In [7]:
# One-hot encode the age groups into indicator columns named 'age_group_<label>'.
# dtype=int keeps the indicators as 0/1 integers regardless of pandas version
# (pandas 2.0+ defaults to boolean True/False columns).
age_dummies = pd.get_dummies(df['age_group'], prefix='age_group', dtype=int)

# Attach the indicator columns to the main DataFrame.
# Dropping any same-named columns first keeps this cell safe to re-run:
# without it, every re-execution would append a duplicate set of dummy columns.
df = pd.concat(
    [df.drop(columns=age_dummies.columns, errors='ignore'), age_dummies],
    axis=1,
)

# Preview the result of the transformation
df.head()

Unnamed: 0,user_id,age,gender,revenue,gender_code,age_group,age_group_18-25,age_group_26-35,age_group_36-45,age_group_46-55
0,C001,24,M,19500750,1,18-25,1,0,0,0
1,C002,19,M,12875500,1,18-25,1,0,0,0
2,C003,27,F,15202500,0,26-35,0,1,0,0
3,C004,31,F,8592480,0,26-35,0,1,0,0
4,C005,22,M,10533680,1,18-25,1,0,0,0


## 4. Normalisasi Data

In [8]:
# Rescale the numeric revenue column with min-max normalization
from sklearn.preprocessing import MinMaxScaler

# Instantiate the scaler with its default settings
scaler = MinMaxScaler()

# Fit on the revenue column (selected with double brackets so it stays 2-D)
# and store the scaled values in a new column
revenue_scaled = scaler.fit_transform(df[['revenue']])
df['revenue_normalized'] = revenue_scaled

# Preview the normalized column alongside the existing ones
df.head()

Unnamed: 0,user_id,age,gender,revenue,gender_code,age_group,age_group_18-25,age_group_26-35,age_group_36-45,age_group_46-55,revenue_normalized
0,C001,24,M,19500750,1,18-25,1,0,0,0,0.790064
1,C002,19,M,12875500,1,18-25,1,0,0,0,0.345496
2,C003,27,F,15202500,0,26-35,0,1,0,0,0.501643
3,C004,31,F,8592480,0,26-35,0,1,0,0,0.058097
4,C005,22,M,10533680,1,18-25,1,0,0,0,0.188355


## 5. Split Data

In [9]:
# Assemble the feature matrix (age, gender code, age-group indicators) and the target (revenue)
feature_columns = [
    'age',
    'gender_code',
    'age_group_18-25',
    'age_group_26-35',
    'age_group_36-45',
    'age_group_46-55',
]
X = df[feature_columns]
y = df['revenue']

# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Report the shape of each feature split
print("Ukuran data training:", X_train.shape)
print("Ukuran data testing:", X_test.shape)

Ukuran data training: (70, 6)
Ukuran data testing: (30, 6)


## 6. Menyimpan Data yang Telah Dipersiapkan

In [10]:
# Create the output data directory if it does not exist yet.
# exist_ok=True replaces the separate exists() check, so the call succeeds
# whether or not the directory is already there.
import os

output_dir = '../data'
os.makedirs(output_dir, exist_ok=True)

# Save the fully prepared dataset (all original and derived columns)
df.to_csv(f'{output_dir}/sales_data_prepared.csv', index=False)

# Save each training/testing split to its own CSV file, named after the variable.
# pd.DataFrame(...) wraps the target Series so every split is written the same way.
data_splits = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for split_name, split_data in data_splits.items():
    pd.DataFrame(split_data).to_csv(f'{output_dir}/{split_name}.csv', index=False)

print("Data yang telah dipersiapkan berhasil disimpan.")

Data yang telah dipersiapkan berhasil disimpan.
