## pandas

#### Topics:

- label encoding, etc

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

print(pd.__version__)

2.2.0


In [57]:
df = pd.read_csv('data_clean.csv')
df = df.drop(['Experience2',], axis=1)
print(df)

      Name  Age         City  Experience  Salary
0    Alice   25      Chicago           2   70000
1      Bob   30  Los Angeles           5   80000
2  Charlie   35      Chicago           7   71000
3    David   40      Houston          10   90000
4      Eva   22      Houston           1   48000
5    Frank   28      Unknown           3   72000
6    Grace   32  San Antonio           6   85000
7    Helen   26    San Diego           2   62000
8    Jerry   23      Phoenix           6   78000


In [58]:
# different types of feature scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, Normalizer

numerical_cols = ['Age', 'Experience', 'Salary']

scaler = StandardScaler()
std_scalar_data = scaler.fit_transform(df[numerical_cols])
std_scalar_df = pd.DataFrame(std_scalar_data, columns=numerical_cols)
print("\nDataFrame after standard scaling:\n", std_scalar_df)


scaler = MinMaxScaler()
minmax_scalar_data = scaler.fit_transform(df[numerical_cols])
minmax_scalar_df = pd.DataFrame(minmax_scalar_data, columns=numerical_cols)
print("\nDataFrame after minmax scaling:\n", minmax_scalar_df)


scaler = RobustScaler()
robust_scalar_data = scaler.fit_transform(df[numerical_cols])
robust_scalar_df = pd.DataFrame(robust_scalar_data, columns=numerical_cols)
print("\nDataFrame after robust scaling:\n", robust_scalar_df)


scaler = MaxAbsScaler()
maxabs_scalar_data = scaler.fit_transform(df[numerical_cols])
maxabs_scalar_df = pd.DataFrame(maxabs_scalar_data, columns=numerical_cols)
print("\nDataFrame after maxabs scaling:\n", maxabs_scalar_df)


scaler = Normalizer()
normalizer_scalar_data = scaler.fit_transform(df[numerical_cols])
normalizer_scalar_df = pd.DataFrame(normalizer_scalar_data, columns=numerical_cols)
print("\nDataFrame after normalizer scaling:\n", normalizer_scalar_df)


DataFrame after standard scaling:
         Age  Experience    Salary
0 -0.719712   -0.970143 -0.243491
1  0.179928    0.121268  0.599362
2  1.079568    0.848875 -0.159206
3  1.979208    1.940285  1.442216
4 -1.259496   -1.333946 -2.097768
5 -0.179928   -0.606339 -0.074920
6  0.539784    0.485071  1.020789
7 -0.539784   -0.970143 -0.917774
8 -1.079568    0.485071  0.430792

DataFrame after minmax scaling:
         Age  Experience    Salary
0  0.166667    0.111111  0.523810
1  0.444444    0.444444  0.761905
2  0.722222    0.666667  0.547619
3  1.000000    1.000000  1.000000
4  0.000000    0.000000  0.000000
5  0.333333    0.222222  0.571429
6  0.555556    0.555556  0.880952
7  0.222222    0.111111  0.333333
8  0.055556    0.555556  0.714286

DataFrame after robust scaling:
         Age  Experience  Salary
0 -0.428571       -0.75    -0.2
1  0.285714        0.00     0.8
2  1.000000        0.50    -0.1
3  1.714286        1.25     1.8
4 -0.857143       -1.00    -2.4
5  0.000000       -0.50 

In [59]:
# Now add the other categorical columns
df = pd.concat([df[['Name', 'City']], normalizer_scalar_df], axis=1)
print(df)

      Name         City       Age  Experience  Salary
0    Alice      Chicago  0.000357    0.000029     1.0
1      Bob  Los Angeles  0.000375    0.000062     1.0
2  Charlie      Chicago  0.000493    0.000099     1.0
3    David      Houston  0.000444    0.000111     1.0
4      Eva      Houston  0.000458    0.000021     1.0
5    Frank      Unknown  0.000389    0.000042     1.0
6    Grace  San Antonio  0.000376    0.000071     1.0
7    Helen    San Diego  0.000419    0.000032     1.0
8    Jerry      Phoenix  0.000295    0.000077     1.0


In [60]:
from sklearn.preprocessing import LabelEncoder

# Encoding categorical variables
label_encoder = LabelEncoder()
df['City_encoded'] = label_encoder.fit_transform(df['City'])

print("\nDataFrame after encoding City column:\n", df)


DataFrame after encoding City column:
       Name         City       Age  Experience  Salary  City_encoded
0    Alice      Chicago  0.000357    0.000029     1.0             0
1      Bob  Los Angeles  0.000375    0.000062     1.0             2
2  Charlie      Chicago  0.000493    0.000099     1.0             0
3    David      Houston  0.000444    0.000111     1.0             1
4      Eva      Houston  0.000458    0.000021     1.0             1
5    Frank      Unknown  0.000389    0.000042     1.0             6
6    Grace  San Antonio  0.000376    0.000071     1.0             4
7    Helen    San Diego  0.000419    0.000032     1.0             5
8    Jerry      Phoenix  0.000295    0.000077     1.0             3


In [61]:
# Splitting Data for Training and Testing
from sklearn.model_selection import train_test_split

X = df[['Age', 'Experience', 'City_encoded']]
y = df['Salary']  # Assuming salary is the target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

print(f"X_train shape: {X_train.shape}")
print(f"X_test  shape: {X_test.shape}")

print("###############################")

print(f"\nTraining Features:\n{X_train}")
print(f"\nTesting Features:\n{X_test}")

X_train shape: (7, 3)
X_test  shape: (2, 3)
###############################

Training Features:
        Age  Experience  City_encoded
6  0.000376    0.000071             4
2  0.000493    0.000099             0
8  0.000295    0.000077             3
0  0.000357    0.000029             0
1  0.000375    0.000062             2
5  0.000389    0.000042             6
7  0.000419    0.000032             5

Testing Features:
        Age  Experience  City_encoded
3  0.000444    0.000111             1
4  0.000458    0.000021             1
