# Berbagi Wawasan #1

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/baraaksayeth/python-data-analytics/blob/main/index.ipynb)


## Membaca File CSV, XML, TXT, XLSX

In [4]:
import pandas as pd

# Show every column when a DataFrame is displayed (None = no column limit).
pd.set_option('display.max_columns', None)

### File CSV

In [None]:
# Load the retail dataset (CSV) straight from the GitHub repository.
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/baraaksayeth/python-data-analytics/main/datasets/retail.csv',
)
# Preview the first 15 rows.
df.head(15)

# Frequently used read_csv options:
#   sep      = ','  -> column delimiter
#   header   = which row to use as the header
#   names    = [new, column, names] to rename the columns
#   skiprows = which rows to skip
#   usecols  = [columns, to, load]
#   nrows    = number of rows to read

### File XLSX

In [None]:
# sheet_name=None loads every sheet and returns a dict keyed by sheet name.
df_excel = pd.read_excel(
    io='https://raw.githubusercontent.com/baraaksayeth/python-data-analytics/main/datasets/retail.xlsx',
    sheet_name=None,
)
# Preview the sheet called 'Worksheet'.
df_excel['Worksheet'].head(n=5)

### File TXT

In [None]:
# The TXT file is tab-separated, so it is parsed with read_csv and a tab delimiter.
df_txt = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/baraaksayeth/python-data-analytics/main/datasets/retail.txt',
    delimiter='\t',
)
df_txt.head(n=5)

### File XML

In [None]:
# Read an XML file into a DataFrame.
df_xml = pd.read_xml(
    path_or_buffer='https://raw.githubusercontent.com/baraaksayeth/python-data-analytics/main/datasets/retail.xml',
)
df_xml.head(n=5)

In [None]:
# Read a nested XML file: <departemen> elements under the root, each holding
# <karyawan> records, and flatten them into one row per <karyawan>.

import xml.etree.ElementTree as ET
import requests

url = 'https://raw.githubusercontent.com/baraaksayeth/python-data-analytics/main/datasets/data.xml'
# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# reports an HTTP failure here rather than as an XML parse error below.
response = requests.get(url, timeout=30)
response.raise_for_status()

root = ET.fromstring(response.content)

# Column order of the resulting DataFrame.
kolom_karyawan = ['id', 'nama', 'umur', 'jabatan', 'email', 'telepon']

data_karyawan = []

for dept in root.findall('departemen'):
    for karyawan in dept.findall('karyawan'):
        # <kontak> may be absent; fall back to None instead of raising
        # AttributeError on the nested lookups.
        kontak = karyawan.find('kontak')
        data_karyawan.append({
            'id': karyawan.get('id'),  # 'id' is an XML attribute, not a child element
            'nama': karyawan.findtext('nama'),
            'umur': karyawan.findtext('umur'),
            'jabatan': karyawan.findtext('jabatan'),
            'email': kontak.findtext('email') if kontak is not None else None,
            'telepon': kontak.findtext('telepon') if kontak is not None else None,
        })

# Passing columns explicitly keeps the schema even when no record was found.
df_xml = pd.DataFrame(data_karyawan, columns=kolom_karyawan)
df_xml.head()

## Mengambil Data dari Database

### Instalasi library

In [1]:
# Install pymysql and SQLAlchemy, used below to fetch data from the database

!pip install pymysql sqlalchemy

Collecting pymysql
  Downloading pymysql-1.1.2-py3-none-any.whl.metadata (4.3 kB)
Downloading pymysql-1.1.2-py3-none-any.whl (45 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 45.3/45.3 kB 1.8 MB/s eta 0:00:00
Installing collected packages: pymysql
Successfully installed pymysql-1.1.2


### Membuat Engine Koneksi

In [2]:
import os

from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Equivalent URL format: mysql+pymysql://user:password@host:port/database
#
# NOTE(security): credentials should not live in a shared notebook. Each part
# of the connection can be supplied through an environment variable; the
# defaults below are the values this notebook used originally, kept only so
# that it still runs unchanged. Prefer setting SALES_DB_* in the environment.
#
# URL.create() takes the parts separately, so special characters in the
# password (such as '#') do not have to be escaped by hand.
db_url = URL.create(
    drivername='mysql+pymysql',
    username=os.environ.get('SALES_DB_USER', 'sales_user'),
    password=os.environ.get('SALES_DB_PASSWORD', 'syzlau#123'),
    host=os.environ.get('SALES_DB_HOST', '103.163.139.132'),
    port=int(os.environ.get('SALES_DB_PORT', '3306')),
    database=os.environ.get('SALES_DB_NAME', 'sales'),
)

engine = create_engine(db_url)

### Mengambil Data

In [5]:
# Query to run: every column and row of the retail_data table.
sql = "SELECT * FROM retail_data"

# Execute the query through the SQLAlchemy engine and load the result set.
df_sql = pd.read_sql(sql=sql, con=engine)
df_sql.head(n=5)

OperationalError: (pymysql.err.OperationalError) (2003, "Can't connect to MySQL server on '103.163.139.132' (timed out)")
(Background on this error at: https://sqlalche.me/e/20/e3q8)

## Basic Data Manipulation

### Sorting

In [None]:
# Sort by a single column, in descending order.
df_sorted = df.sort_values('Date', ascending=False)

# Sort by several columns, each in descending order.
df_sorted = df.sort_values(['Date', 'CustomerID'], ascending=[False, False])
df_sorted.head(n=5)

### Sorting dengan menggunakan lambda

In [None]:
# Order rows by the length of the Product string, longest first.
df_sorted = df.sort_values(
    'Product',
    ascending=False,
    key=lambda col: col.str.len(),
)
df_sorted.head(n=5)

### Replace

In [None]:
# Replace the value 'Jakarta' with 'DKI Jakarta' in the Location column.
df['Location'] = df['Location'].replace(to_replace='Jakarta', value='DKI Jakarta')

# Fill missing Quantity values with 0.
df['Quantity'] = df['Quantity'].fillna(value=0)

df.head(n=15)

In [None]:
# Replace several values at once: each dict key is swapped for its value.
df['PaymentMethod'] = df['PaymentMethod'].replace(
    to_replace={'Credit Card': 'CC', 'E-Wallet': 'EW'},
)

In [None]:
# Regex replace: strip every character that is not an ASCII letter or digit.
df['Category'] = df['Category'].replace(
    to_replace=r'[^a-zA-Z0-9]',
    value='',
    regex=True,
)

In [None]:
# Derive a label column from Discount with a lambda:
# a discount of exactly 0 -> 'Tanpa Diskon', anything else -> 'Menggunakan Diskon'.

df['StatusDiscount'] = df['Discount'].apply(
    lambda diskon: 'Menggunakan Diskon' if diskon != 0 else 'Tanpa Diskon'
)

df.head(n=5)

### Drop Data

In [None]:
# Drop a column.
# NOTE: this removes 'Age' from df in place, so it is gone for every later cell.
# errors='ignore' makes the cell safe to re-run: without it a second run raises
# KeyError because the column no longer exists.
df.drop(columns='Age', inplace=True, errors='ignore')
df.head()

In [None]:
# Drop a row: removes the row whose index *label* is 3 (not the row at position 3).
# errors='ignore' makes the cell safe to re-run: without it a second run raises
# KeyError because the label no longer exists.
df.drop(index=3, inplace=True, errors='ignore')
df.head()

In [None]:
# Find duplicated rows: rows whose (CustomerID, Product) pair already appeared
# earlier in the frame (the first occurrence is not flagged).
dupes = df.loc[df.duplicated(subset=['CustomerID', 'Product'])]
dupes.head(n=5)

In [None]:
# Remove rows that are exact duplicates across all columns (in place).
df.drop_duplicates(inplace=True)

# Remove duplicates judged on selected columns only, keeping the last
# occurrence of each (Date, CustomerID) pair.
df.drop_duplicates(
    subset=['Date', 'CustomerID'],
    keep='last',
    inplace=True,
)

### Filtering Data

In [None]:
# Filtering examples.
# Only the last expression of a notebook cell is displayed; the earlier
# filters are evaluated and their results discarded.

# Rows with Discount greater than 10 (percent, per the original note — confirm
# the unit) and Total greater than 100.
df[(df['Discount'] > 10) & (df['Total'] > 100)]

# All transactions by customers younger than 30 located in Surabaya or Bandung.
# The "drop column" cell earlier in this notebook removes 'Age' from df in
# place, so guard the lookup; without the guard this line raises KeyError when
# the notebook is run top to bottom.
if 'Age' in df.columns:
    df[(df['Age'] < 30) & (df['Location'].isin(['Surabaya', 'Bandung']))]

# Products whose name contains a given word (case-insensitive).
# na=False treats missing product names as "no match"; without it a NaN in the
# mask makes the boolean indexing fail.
df[df['Product'].str.contains('Milk', case=False, na=False)]

# Products whose name is longer than 10 characters.
df[df['Product'].str.len() > 10]

# Top N: the 3 rows with the highest Total.
df.nlargest(3, 'Total')

In [None]:
# Select purchases whose Total is above the average Total.

mean_total = df['Total'].mean()
print(mean_total)

# Boolean mask: True where the row's Total exceeds the mean.
df.loc[df['Total'] > mean_total]

In [None]:
# Compute total sales per date.

print(df.shape)

# Parse Date into datetime values before grouping on it.
df['Date'] = pd.to_datetime(df['Date'])

# as_index=False keeps Date as a regular column in the result.
penjualan_per_hari = df.groupby('Date', as_index=False)['Total'].sum()

print(penjualan_per_hari)

### Joining Dataframes

In [None]:
# Load the customer dataset that will be joined to the transactions by column.
# It is read from the repository's raw GitHub URL, like every other dataset in
# this notebook, so the cell also works in Colab, where the relative path
# 'datasets/customer.csv' does not exist.
df_customer = pd.read_csv('https://raw.githubusercontent.com/baraaksayeth/python-data-analytics/main/datasets/customer.csv')

df_customer.head()

In [None]:
# Inner join: keep only transactions whose CustomerID exists in df_customer.
df_lengkap = df.merge(right=df_customer, how='inner', on='CustomerID')
df_lengkap.head(n=5)

In [None]:
# Count the rows (transactions) per customer.
transaksi_per_customer = (
    df_lengkap
    .groupby('CustomerID')
    .size()
    .reset_index(name='TotalTransaksi')
)
transaksi_per_customer.head()

# Attach the customer details, then keep only the columns of interest.
result = transaksi_per_customer.merge(df_customer, how='inner', on='CustomerID')
result = result.loc[:, ['CustomerID', 'Name', 'TotalTransaksi']]
result.head(n=5)

In [None]:
# Per-customer summary using named aggregation:
#   TotalTransaksi = count of non-null InvoiceNo values
#   TotalSpent     = sum of Total
per_customer = df_lengkap.groupby('CustomerID').agg(
    TotalTransaksi=pd.NamedAgg(column='InvoiceNo', aggfunc='count'),
    TotalSpent=pd.NamedAgg(column='Total', aggfunc='sum'),
)
# Turn the CustomerID index back into a regular column.
transaksi_per_customer = per_customer.reset_index()
del per_customer

transaksi_per_customer.head(n=5)

In [None]:
# Build a per-date transaction summary:
#   TotalCustomer = number of distinct CustomerID values on that date
#   TotalSpent    = sum of Total on that date
summary_per_tanggal = (
    df_lengkap
    .groupby('Date')
    .agg(
        TotalCustomer=pd.NamedAgg(column='CustomerID', aggfunc='nunique'),
        TotalSpent=pd.NamedAgg(column='Total', aggfunc='sum'),
    )
    .reset_index()
)

summary_per_tanggal.head(n=5)