### Pandas library
Link to documentation: https://pandas.pydata.org/docs/getting_started/intro_tutorials/01_table_oriented.html

### 1. Import required modules

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%config InlineBackend.figure_formats=['svg']
from IPython.display import Image

### 2. Series in Pandas

In [6]:
# Build two one-dimensional Series of strings: people's names and colors
name_series = pd.Series(data=["Pejman", "Paris", "Frank", "Sara"])
color_series = pd.Series(data=["Green", "Blue", "Red", "White"])

# NOTE: a bare expression in the middle of a cell is evaluated but not rendered;
# only the final expression of the cell is displayed.
color_series

# Summary of the color Series:
# number of dimensions, element count, dtype, Python type, and shape
(
    color_series.ndim,
    color_series.size,
    color_series.dtype,
    type(color_series),
    color_series.shape,
)

(1, 4, dtype('O'), pandas.core.series.Series, (4,))

### 2. DataFrame in Pandas

In [12]:
# Build a DataFrame from a dict of columns; each value may be a Series or a
# plain list, and all columns here have four entries.
simple_df = pd.DataFrame(data={
    "Names": name_series,
    "Colors": color_series,
    "Scores": [1, 2, 3, 4],
    "Cars": pd.Series(["Benz", "Kia", "BMW", "Tesla"]),
})

simple_df

# Custom dataset: 100 rows x 4 columns of uniform random floats in [0, 1).
# A seeded Generator is used so the values are identical on every
# Restart & Run All (the unseeded np.random.rand changed them each run).
custom_df = pd.DataFrame(
    data=np.random.default_rng(seed=42).random((100, 4)),
    columns=["A", "B", "C", "D"],
)
custom_df.head(10)
len(custom_df)

100

In [None]:
# Mount Google Drive when running on Google Colab.
# The google.colab package only exists inside Colab, so the mount is skipped
# elsewhere instead of aborting the whole notebook on Restart & Run All.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

### 3. Work with DataFrame (CSV data)
* Link to data on Kaggle: https://www.kaggle.com/datasets/prathamtripathi/drug-classification
* link to data in GitHub: https://raw.githubusercontent.com/arad1367/WAC_November-2023/main/drug200.csv

In [19]:
# URL of the raw drug-classification CSV hosted on GitHub
data = "https://raw.githubusercontent.com/arad1367/WAC_November-2023/main/drug200.csv"

# Download and parse the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=data)

# Peek at the first and last five rows
# (only the final expression of the cell is rendered)
df.head(n=5)
df.tail(n=5)

# Number of rows
df.shape[0]

# Number of dimensions, total number of cells, and (rows, columns)
(df.ndim, df.size, df.shape)


(2, 1200, (200, 6))

In [17]:
# Show the first five rows of the drug DataFrame
df.head(n=5)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [24]:
# Inspect the DataFrame `df`.
# Other helpers worth trying in their own cells:
#   df.info()          column dtypes and non-null counts
#   df.describe()      summary statistics of the numeric columns
#   df.isna().sum()    missing values per column

# Number of rows for each distinct value of the "Drug" column
df.loc[:, "Drug"].value_counts()

Drug
DrugY    91
drugX    54
drugA    23
drugC    16
drugB    16
Name: count, dtype: int64

### 4. Filter data

In [25]:
# Re-display the first five rows before filtering
df.head(n=5)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [29]:
# Build both filtered views first, then report their sizes.

# Patients strictly older than 35
df_age_filter = df.loc[df["Age"].gt(35)]

# Patients whose cholesterol level is recorded as "HIGH"
df_high = df.loc[df["Cholesterol"].eq("HIGH")]

print(f"len of original df is: {len(df)}")
print(f"len of age filtere df is: {len(df_age_filter)}")
print(f"Len of High chol patient df: {len(df_high)}")

len of original df is: 200
len of age filtere df is: 134
Len of High chol patient df: 103


### 5. Data preprocessing

* Check missing values
* Encode categorical variables into numerical values: https://towardsdatascience.com/categorical-encoding-using-label-encoding-and-one-hot-encoder-911ef77fb5bd

In [43]:
# Load the variant of the dataset that contains missing values.
# NOTE(review): this absolute path presumably points at a file uploaded to the
# Colab session storage beforehand — confirm it exists before a fresh run.
new_data_path = "/content/drug200_missing.csv"
df_missing = pd.read_csv(filepath_or_buffer=new_data_path)

df_missing.head(n=5)

# Count of missing values in each column
df_missing.isnull().sum()

Age            0
Sex            0
BP             0
Cholesterol    3
Na_to_K        4
Drug           0
dtype: int64

In [37]:
# Solution 1 --> fillna()
# The result goes into a NEW frame so that `df_missing` keeps its NaNs:
# overwriting it here would leave nothing for a later dropna() on `df_missing`
# to remove when the notebook is run top to bottom.
df_filled = df_missing.fillna({
    # str column: mark gaps with an explicit placeholder label
    "Cholesterol": "missing",
    # float column: replace gaps with the column median
    "Na_to_K": df_missing["Na_to_K"].median(),
})

# Every column should now report zero missing values
df_filled.isna().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [44]:
# Solution 2 --> dropna()
# dropna() returns a new frame without the rows that contain any missing
# value; the source frame is left unchanged.
df_without_missing = df_missing.dropna()

# Row counts before and after dropping
(df_missing.shape[0], df_without_missing.shape[0])

(200, 193)

In [45]:
# Encode categorical variables into numerical values.
# LabelEncoder comes from scikit-learn; a single instance `le` is created here
# and used by later cells.
from sklearn.preprocessing import LabelEncoder

# One encoder object; every fit_transform() call on it re-fits the encoder.
le = LabelEncoder()

In [46]:
# Replace each categorical column of `df_missing` with integer codes.
# The same encoder `le` is re-fitted on every column in turn, so after the loop
# it only remembers the classes of the last column ("Drug").
for col in ("Sex", "BP", "Cholesterol", "Drug"):
    df_missing[col] = le.fit_transform(df_missing[col])

# Last ten rows after encoding
df_missing.tail(n=10)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
190,58,1,0,0,18.991,0
191,23,1,0,0,8.011,1
192,72,1,1,0,16.31,0
193,72,1,1,0,6.769,3
194,46,0,0,0,34.686,0
195,56,0,1,0,11.567,3
196,16,1,1,0,12.006,3
197,52,1,2,0,9.894,4
198,23,1,2,1,14.02,4
199,40,0,1,1,11.349,4


In [47]:
# Labels learned by the most recent fit of `le`. The encoder was last fitted on
# the "Drug" column, so these are the drug names.
le.classes_

array(['DrugY', 'drugA', 'drugB', 'drugC', 'drugX'], dtype=object)

### Homework: Define a function that returns a dictionary mapping each original label to its encoded value

### 6. Make a new column

In [48]:
# First five rows of the original drug DataFrame, before a column is added
df.head(n=5)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [49]:
# Derive a new column from an existing one: every age shifted up by ten years.
# Assigning to a new key adds the column to `df` itself.
df["Age plus 10"] = df["Age"].add(10)

# Last six rows, including the new column
df.tail(n=6)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,Age plus 10
194,46,F,HIGH,HIGH,34.686,DrugY,56
195,56,F,LOW,HIGH,11.567,drugC,66
196,16,M,LOW,HIGH,12.006,drugC,26
197,52,M,NORMAL,HIGH,9.894,drugX,62
198,23,M,NORMAL,NORMAL,14.02,drugX,33
199,40,F,LOW,NORMAL,11.349,drugX,50


### 7. Shuffle data

In [51]:
# Shuffle all rows: frac=1 samples 100% of the rows without replacement,
# which yields a random permutation. A fixed random_state makes the
# permutation identical on every Restart & Run All.
df_shuffled = df.sample(frac=1, random_state=42)
df_shuffled.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,Age plus 10
68,54,M,NORMAL,HIGH,24.658,DrugY,64
118,32,F,HIGH,NORMAL,10.292,drugA,42
83,38,F,HIGH,NORMAL,11.326,drugA,48
75,26,M,LOW,NORMAL,20.909,DrugY,36
106,22,M,NORMAL,HIGH,11.953,drugX,32


### 8. Save and load data

In [53]:
# Save data
# The write is active again so the read below no longer depends on a file
# left behind by an earlier session. It targets the same path that is read
# back here and by the Polars example. index=False omits the shuffled row
# labels from the file.
df_shuffled.to_csv("/content/my_new_dataset", index=False)

# Load data
# Reading the file back gives a fresh default 0..n-1 index.
my_new_data = pd.read_csv("/content/my_new_dataset")
my_new_data.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,Age plus 10
0,54,M,NORMAL,HIGH,24.658,DrugY,64
1,32,F,HIGH,NORMAL,10.292,drugA,42
2,38,F,HIGH,NORMAL,11.326,drugA,48
3,26,M,LOW,NORMAL,20.909,DrugY,36
4,22,M,NORMAL,HIGH,11.953,drugX,32


### Polars library
https://docs.pola.rs/user-guide/getting-started/

In [54]:
!pip install polars



In [55]:
import polars as pl

In [56]:
# Read the CSV saved earlier, this time with Polars instead of pandas
df_polar = pl.read_csv("/content/my_new_dataset")

# End the cell with the expression itself so the notebook renders the frame
# with its rich display instead of plain printed text.
df_polar.head()

shape: (5, 7)
┌─────┬─────┬────────┬─────────────┬─────────┬───────┬─────────────┐
│ Age ┆ Sex ┆ BP     ┆ Cholesterol ┆ Na_to_K ┆ Drug  ┆ Age plus 10 │
│ --- ┆ --- ┆ ---    ┆ ---         ┆ ---     ┆ ---   ┆ ---         │
│ i64 ┆ str ┆ str    ┆ str         ┆ f64     ┆ str   ┆ i64         │
╞═════╪═════╪════════╪═════════════╪═════════╪═══════╪═════════════╡
│ 54  ┆ M   ┆ NORMAL ┆ HIGH        ┆ 24.658  ┆ DrugY ┆ 64          │
│ 32  ┆ F   ┆ HIGH   ┆ NORMAL      ┆ 10.292  ┆ drugA ┆ 42          │
│ 38  ┆ F   ┆ HIGH   ┆ NORMAL      ┆ 11.326  ┆ drugA ┆ 48          │
│ 26  ┆ M   ┆ LOW    ┆ NORMAL      ┆ 20.909  ┆ DrugY ┆ 36          │
│ 22  ┆ M   ┆ NORMAL ┆ HIGH        ┆ 11.953  ┆ drugX ┆ 32          │
└─────┴─────┴────────┴─────────────┴─────────┴───────┴─────────────┘
