## pandas
- Used for EDA
- used for data cleaning, etc
- importing/exporting data, creating/deleting columns,

#### Topics:
- Series: difference between pandas **Series** and pandas **Dataframe**, access series, operation on series
- Dataframe: data collection->read from CSV, data access ( loc, iloc )
- statitical operation: mean, etc

In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

print(pd.__version__)
data_file = 'data.csv'

2.2.0


# Series
## Difference between pandas Series and pandas Dataframe
```
Feature	        Series              DataFrame
Dimensions	      1D                  2D
Shape             (n,)             (rows, columns)
Data Structure    Single column   Table with multiple columns
Index             Yes               Yes (rows & columns)
Usage         Single column or row   Full dataset
```

In [81]:
# Series declaration

s = pd.Series([10, 20, 30, 40]) # From a List
print(type(s)) # <class 'pandas.core.series.Series'>
print(s)

print("#####################")
s = pd.Series([10, 20, 30], index=['a', 'b', 'c']) # From a List with Custom Index
print(s)

print("$$$$$$$$$$$$$$$$$$$$$$$$$")
data = {'Alice': 25, 'Bob': 30, 'Charlie': 35} #  From a Dictionary
s = pd.Series(data)
print(s)
print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
s = pd.Series(np.random.randint(10,20,5))
print(s)

print("**********************")
series_age = df['Age']
print(type(series_age)) # <class 'pandas.core.series.Series'>
print(series_age)

<class 'pandas.core.series.Series'>
0    10
1    20
2    30
3    40
dtype: int64
#####################
a    10
b    20
c    30
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$
Alice      25
Bob        30
Charlie    35
dtype: int64
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0    19
1    14
2    15
3    19
4    17
dtype: int32
**********************
<class 'pandas.core.series.Series'>
0     25
1     30
2     35
3     40
4     22
5     28
6     32
7     26
8     26
9     26
10    23
Name: Age, dtype: int64


In [82]:
# access series
s = pd.Series([10, 20, 30, 40, 50])
print(s[0])
print(s[:3])
print(s[2:])
print(s[1:3])


10
0    10
1    20
2    30
dtype: int64
2    30
3    40
4    50
dtype: int64
1    20
2    30
dtype: int64


### operation on series

In [83]:
# 1. Arithmetic Operations
s = pd.Series([10, 20, 30])

print(s + 5)       # Add 5 to each element
print(s * 2)       # Multiply each element by 2
print(s / 10)      # Divide each element by 10

# 2. Statistical Operations
s = pd.Series([10, 20, 30, 40])

print(s.mean())      # Average
print(s.median())    # Median
print(s.std())       # Standard deviation
print(s.max())       # Maximum
print(s.min())       # Minimum
print(s.sum())       # Sum of all elements
print(s.cumsum())
print(s.cumprod())
print(s.describe())
print(s.quantile(0.25))  # 25th percentile

print(s.sem())  # Standard error mean
print(s.nunique())  # Unique count

print(s.value_counts())  # Value counts (frequency of each unique value)

print(s.idxmin())  # Index of first min value
print(s.idxmax())  # Index of first max value



# 3. Element-wise Functions
s = pd.Series([1, 2, 3, 4])
print(np.sqrt(s))      # Square root
print(np.exp(s))       # Exponential
print(np.log(s))       # Logarithm

# 4. Boolean Filtering
s = pd.Series([10, 20, 30, 40])
print(s>25)
print(s[s > 25])       # Filter elements greater than 25

# 5. Value Counts & Uniqueness
s = pd.Series(['apple', 'banana', 'apple', 'orange'])
print(s.value_counts())   # Frequency of unique values
print(s.unique())         # Unique values

# 6. Mapping / Transformation
s = pd.Series([1, 2, 3])
print(s.map(lambda x: x * 10))  # Apply a function to each element

# 7. String Operations (for string Series)
s = pd.Series(['hello', 'my friend'])
print(s.str.upper())     # Convert to uppercase
print(s.str.len())       # Length of each string

# 8. Handling Missing Data
s = pd.Series([1, 2, None, 4])
print(s.isnull())        # Check for NaNs
print(s.fillna(0))       # Replace NaNs with 0
print(s.dropna())        # Drop NaNs

# 9. Sorting
s = pd.Series([10, 2, 30])
print(s.sort_values())   # Sort by value
print(s.sort_index())    # Sort by index

# 10. Combine / Arithmetic Between Series
s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = pd.Series([4, 5, 6], index=['a', 'b', 'c'])
print(s1 + s2)   # Aligns by index and adds

0    15
1    25
2    35
dtype: int64
0    20
1    40
2    60
dtype: int64
0    1.0
1    2.0
2    3.0
dtype: float64
25.0
25.0
12.909944487358056
40
10
100
0     10
1     30
2     60
3    100
dtype: int64
0        10
1       200
2      6000
3    240000
dtype: int64
count     4.000000
mean     25.000000
std      12.909944
min      10.000000
25%      17.500000
50%      25.000000
75%      32.500000
max      40.000000
dtype: float64
17.5
6.454972243679028
4
10    1
20    1
30    1
40    1
Name: count, dtype: int64
0
3
0    1.000000
1    1.414214
2    1.732051
3    2.000000
dtype: float64
0     2.718282
1     7.389056
2    20.085537
3    54.598150
dtype: float64
0    0.000000
1    0.693147
2    1.098612
3    1.386294
dtype: float64
0    False
1    False
2     True
3     True
dtype: bool
2    30
3    40
dtype: int64
apple     2
banana    1
orange    1
Name: count, dtype: int64
['apple' 'banana' 'orange']
0    10
1    20
2    30
dtype: int64
0        HELLO
1    MY FRIEND
dtype: object
0    5
1

## Dataframe

In [84]:
# Create a DataFrame: hardcode the data
data = {
    'A': pd.Series(pd.date_range("2023-01-01", periods=5, freq='D')),  # Time data
    'B': pd.Series([120.5, 123.0, 121.3, 125.6, 124.2]),
    'C': pd.Series(['Buy', 'Sell', 'Hold', 'Buy', 'Sell']),
    'D': pd.Series(np.random.randint(1, 11, size=5))
}
df = pd.DataFrame(data)

print(df)

           A      B     C   D
0 2023-01-01  120.5   Buy   1
1 2023-01-02  123.0  Sell   8
2 2023-01-03  121.3  Hold   8
3 2023-01-04  125.6   Buy  10
4 2023-01-05  124.2  Sell   8


In [85]:

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Helen', 'Helen', 'Helen', 'Jerry'],
    'Age': [25, 30, 35, 40, 22, 28, 32, 26, 26, 26, 23],
    'City': ['Chicago', 'Los Angeles', 'Chicago', 'Houston', 'Houston', np.nan, 'San Antonio', 'San Diego', 'San Diego', 'San Diego', 'Phoenix'],
    'Experience': [2, 5, 7, 10, 1, 3, 6, 2, 2, 2, 6],
    'Experience2': [2, 5, 7, 10, 1, 3, 6, 2, 2, 2, 6],
    'Salary': [70000.0, 80000.0, np.nan, 90000.0, 48000.0, 72000.0, 85000.0, 62000.0, 62000.0, 62000.0, 78000.0]
}

df = pd.DataFrame(data)
print(df)
df.to_csv(data_file, index=False)

       Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.0
2   Charlie   35      Chicago           7            7      NaN
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28          NaN           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
8     Helen   26    San Diego           2            2  62000.0
9     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0


In [86]:
# Or Read from CSV with comma seperator
from pathlib import Path
data_file = Path.cwd() / 'data.csv'

df = pd.read_csv(data_file)

print("\nDataFrame1:\n", df)
# print(f"\nDataFrame1:\n{}")


DataFrame1:
        Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.0
2   Charlie   35      Chicago           7            7      NaN
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28          NaN           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
8     Helen   26    San Diego           2            2  62000.0
9     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0


## loc, iloc
- loc - Label-based selection
- .iloc[] – Integer position-based selection

In [87]:
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, 30, 35, 40, 22],
    'City': ['Chicago', 'LA', 'Chicago', 'Houston', 'Houston']
}

df = pd.DataFrame(data, index=['a', 'b', 'c', 'd', 'e'])
print(df)
#############################################
# 1. Access a single row by label:
df.loc['b']

# 2. Access multiple rows by labels:
df.loc[['a', 'c', 'e']]

# 3. Access specific rows and columns:
df.loc[['a', 'b'], ['Name', 'City']]

# 4. Filter rows with a condition:
df.loc[df['Age'] > 30]
df.loc[df['Age'] > 30 , ['Name', 'City']]
###########################################
# 1. Access a single row by index:
df.iloc[1]  # second row

# 2. Access a specific cell (row 1, column 2):
df.iloc[1, 2]  # LA

# 3. Access multiple rows and columns:
df.iloc[0:3, 0:2]  # First 3 rows, first 2 columns

# 4. Modify a value:
df.iloc[0, 1] = 26  # Change Alice's age from 25 to 26


      Name  Age     City
a    Alice   25  Chicago
b      Bob   30       LA
c  Charlie   35  Chicago
d    David   40  Houston
e      Eva   22  Houston


## statistical operations
- mean , 

In [88]:
df = pd.read_csv('data.csv')
print("\nDataFrame:\n", df)

print(df['Age'].mean())
print(df['Age'].sum())

# print(df.mean()) # # ERROR because of string columns
print(df.mean(numeric_only=True))
print(df.sum(numeric_only=True))

print(df.describe())


DataFrame:
        Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.0
2   Charlie   35      Chicago           7            7      NaN
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28          NaN           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
8     Helen   26    San Diego           2            2  62000.0
9     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0
28.454545454545453
313
Age               28.454545
Experience         4.181818
Experience2        4.181818
Salary         70900.000000
dtype: float64
Age               313.0
Experience         46.0
Experience2        46

In [89]:
print(df.shape) # (rows,columns)

(11, 6)


## modify values, handle missing values, duplicates

In [90]:
# change values
df = pd.read_csv('data.csv')
df.loc[1, 'Salary'] = 30  # Set a value in Salary for 2nd entry
df.loc[2, 'City'] = "delhi"  # Set a value in City for 3rd entry
print("\nDataFrame2:\n", df)


DataFrame2:
        Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5     30.0
2   Charlie   35        delhi           7            7      NaN
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28          NaN           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
8     Helen   26    San Diego           2            2  62000.0
9     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0


In [91]:
# Applying .map() to increase salary by 10%
df['Salary_increase'] = df['Salary'].map(lambda x: x * 1.1)
print("\nAfter salary increase:\n",df)


After salary increase:
        Name  Age         City  Experience  Experience2   Salary  \
0     Alice   25      Chicago           2            2  70000.0   
1       Bob   30  Los Angeles           5            5     30.0   
2   Charlie   35        delhi           7            7      NaN   
3     David   40      Houston          10           10  90000.0   
4       Eva   22      Houston           1            1  48000.0   
5     Frank   28          NaN           3            3  72000.0   
6     Grace   32  San Antonio           6            6  85000.0   
7     Helen   26    San Diego           2            2  62000.0   
8     Helen   26    San Diego           2            2  62000.0   
9     Helen   26    San Diego           2            2  62000.0   
10    Jerry   23      Phoenix           6            6  78000.0   

    Salary_increase  
0           77000.0  
1              33.0  
2               NaN  
3           99000.0  
4           52800.0  
5           79200.0  
6           9350

In [92]:
# Lets drop the above 'Salary_increase' column
df.drop(columns=['Salary_increase','Experience2'], inplace=True)
print("\nAfter dropping a column:\n",df)


After dropping a column:
        Name  Age         City  Experience   Salary
0     Alice   25      Chicago           2  70000.0
1       Bob   30  Los Angeles           5     30.0
2   Charlie   35        delhi           7      NaN
3     David   40      Houston          10  90000.0
4       Eva   22      Houston           1  48000.0
5     Frank   28          NaN           3  72000.0
6     Grace   32  San Antonio           6  85000.0
7     Helen   26    San Diego           2  62000.0
8     Helen   26    San Diego           2  62000.0
9     Helen   26    San Diego           2  62000.0
10    Jerry   23      Phoenix           6  78000.0


In [93]:
# Handling Missing Values
df_missing_value = df.isnull()
print(df_missing_value)

missing_values = df.isnull().sum()
print("\nMissing Values:\n", missing_values )

     Name    Age   City  Experience  Salary
0   False  False  False       False   False
1   False  False  False       False   False
2   False  False  False       False    True
3   False  False  False       False   False
4   False  False  False       False   False
5   False  False   True       False   False
6   False  False  False       False   False
7   False  False  False       False   False
8   False  False  False       False   False
9   False  False  False       False   False
10  False  False  False       False   False

Missing Values:
 Name          0
Age           0
City          1
Experience    0
Salary        1
dtype: int64


In [94]:
df = pd.read_csv('data.csv')

df_clean = df.dropna() #  Drop Rows with Any Missing Values

df_clean = df.dropna(axis=1) # Drop Columns with Any Missing Values

df_clean = df.dropna(how='all') # Drop Rows Only If All Values Are Missing

df_clean = df.dropna(subset=['Salary']) # Drop Rows Where Specific Column Has Missing Value

print(df)

       Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.0
2   Charlie   35      Chicago           7            7      NaN
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28          NaN           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
8     Helen   26    San Diego           2            2  62000.0
9     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0


In [105]:
df = pd.read_csv('data.csv')
print(df)
df = df.fillna(0)
print("\nDataFrame after filling missing values:\n", df)

       Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.0
2   Charlie   35      Chicago           7            7      NaN
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28          NaN           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
8     Helen   26    San Diego           2            2  62000.0
9     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0

DataFrame after filling missing values:
        Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.

In [96]:
df = pd.read_csv('data.csv')
print(df)
df = df.fillna(method='ffill')
print("\nDataFrame after filling missing values:\n", df)

       Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.0
2   Charlie   35      Chicago           7            7      NaN
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28          NaN           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
8     Helen   26    San Diego           2            2  62000.0
9     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0

DataFrame after filling missing values:
        Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.

  df = df.fillna(method='ffill')


In [97]:
df = pd.read_csv('data.csv')
print(df)
df = df.fillna(method='bfill')
print("\nDataFrame after filling missing values:\n", df)

       Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.0
2   Charlie   35      Chicago           7            7      NaN
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28          NaN           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
8     Helen   26    San Diego           2            2  62000.0
9     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0

DataFrame after filling missing values:
        Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.

  df = df.fillna(method='bfill')


In [107]:
df = pd.read_csv('data.csv')
print(df)
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
print("\nDataFrame after filling missing values:\n", df)

       Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.0
2   Charlie   35      Chicago           7            7      NaN
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28          NaN           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
8     Helen   26    San Diego           2            2  62000.0
9     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0

DataFrame after filling missing values:
        Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.

In [98]:
df = pd.read_csv('data.csv')

df = df.fillna({'Salary': df['Salary'].median(),
                'City': 'Unknown'
})
print("\nDataFrame after filling missing values:\n", df)


DataFrame after filling missing values:
        Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.0
2   Charlie   35      Chicago           7            7  71000.0
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28      Unknown           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
8     Helen   26    San Diego           2            2  62000.0
9     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0


In [99]:
# Finding and Handling Duplicate Records
duplicates = df.duplicated()
print("\nDuplicate Records:\n", df[duplicates])


Duplicate Records:
     Name  Age       City  Experience  Experience2   Salary
8  Helen   26  San Diego           2            2  62000.0
9  Helen   26  San Diego           2            2  62000.0


In [100]:
# Removing duplicates
df = df.drop_duplicates()
print("\nDataFrame after removing duplicates:\n", df)


DataFrame after removing duplicates:
        Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.0
2   Charlie   35      Chicago           7            7  71000.0
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28      Unknown           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0


In [101]:
#Save the clean data to CSV file
df.to_csv("data_clean.csv")

In [102]:
from sklearn.preprocessing import LabelEncoder

# Encoding categorical variables
label_encoder = LabelEncoder()
df['City_encoded'] = label_encoder.fit_transform(df['City'])

print("\nDataFrame after encoding City column:\n", df)



DataFrame after encoding City column:
        Name  Age         City  Experience  Experience2   Salary  City_encoded
0     Alice   25      Chicago           2            2  70000.0             0
1       Bob   30  Los Angeles           5            5  80000.0             2
2   Charlie   35      Chicago           7            7  71000.0             0
3     David   40      Houston          10           10  90000.0             1
4       Eva   22      Houston           1            1  48000.0             1
5     Frank   28      Unknown           3            3  72000.0             6
6     Grace   32  San Antonio           6            6  85000.0             4
7     Helen   26    San Diego           2            2  62000.0             5
10    Jerry   23      Phoenix           6            6  78000.0             3


In [103]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Feature Scaling
scaler = StandardScaler()
df[['Age', 'Salary', 'Experience']] = scaler.fit_transform(df[['Age', 'Salary', 'Experience']])
print("\nDataFrame after scaling:\n", df)



DataFrame after scaling:
        Name       Age         City  Experience  Experience2    Salary  \
0     Alice -0.719712      Chicago   -0.970143            2 -0.243491   
1       Bob  0.179928  Los Angeles    0.121268            5  0.599362   
2   Charlie  1.079568      Chicago    0.848875            7 -0.159206   
3     David  1.979208      Houston    1.940285           10  1.442216   
4       Eva -1.259496      Houston   -1.333946            1 -2.097768   
5     Frank -0.179928      Unknown   -0.606339            3 -0.074920   
6     Grace  0.539784  San Antonio    0.485071            6  1.020789   
7     Helen -0.539784    San Diego   -0.970143            2 -0.917774   
10    Jerry -1.079568      Phoenix    0.485071            6  0.430792   

    City_encoded  
0              0  
1              2  
2              0  
3              1  
4              1  
5              6  
6              4  
7              5  
10             3  


In [104]:
# Splitting Data for Training and Testing
from sklearn.model_selection import train_test_split

X = df[['Age', 'Experience', 'City_encoded']]
y = df['Salary']  # Assuming salary is the target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

print(f"X_train shape: {X_train.shape}")
print(f"X_test  shape: {X_test.shape}")

print("###############################")

print(f"\nTraining Features:\n{X_train}")
print(f"\nTesting Features:\n{X_test}")

X_train shape: (7, 3)
X_test  shape: (2, 3)
###############################

Training Features:
         Age  Experience  City_encoded
6   0.539784    0.485071             4
2   1.079568    0.848875             0
10 -1.079568    0.485071             3
0  -0.719712   -0.970143             0
1   0.179928    0.121268             2
5  -0.179928   -0.606339             6
7  -0.539784   -0.970143             5

Testing Features:
        Age  Experience  City_encoded
3  1.979208    1.940285             1
4 -1.259496   -1.333946             1
