In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw dataset from data.csv in the current working directory.
from pathlib import Path

data_file = Path.cwd().joinpath('data.csv')
df = pd.read_csv(data_file)

# Show the whole frame as loaded; print() puts a space between the heading
# and the frame because they are passed as two separate arguments.
print("\nDataFrame1:\n", df)



DataFrame1:
        Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.0
2   Charlie   35      Chicago           7            7      NaN
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28          NaN           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
8     Helen   26    San Diego           2            2  62000.0
9     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0


In [3]:
# Dimensions of the frame as a (rows, columns) tuple.
frame_shape = df.shape
print(frame_shape)

(11, 6)


In [4]:
# Overwrite individual cells, addressed by (row label, column name).
cell_updates = (
    (1, 'Salary', 30),       # row label 1 (second row): Salary becomes 30
    (2, 'City', "delhi"),    # row label 2 (third row): City becomes "delhi"
)
for row_label, column_name, new_value in cell_updates:
    df.loc[row_label, column_name] = new_value

print("\nDataFrame2:\n", df)


DataFrame2:
        Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5     30.0
2   Charlie   35        delhi           7            7      NaN
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28          NaN           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
8     Helen   26    San Diego           2            2  62000.0
9     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0


In [5]:
def _raise_by_ten_percent(salary):
    """Return ``salary`` multiplied by 1.1, i.e. increased by 10%."""
    return salary * 1.1


# Demonstrate Series.map(): apply the helper to every Salary value and store
# the result in a new 'Salary_increase' column. 'Salary' itself is untouched,
# and a missing salary stays missing (NaN * 1.1 is NaN).
df['Salary_increase'] = df['Salary'].map(_raise_by_ten_percent)
print("\nAfter salary increase:\n",df)


After salary increase:
        Name  Age         City  Experience  Experience2   Salary  \
0     Alice   25      Chicago           2            2  70000.0   
1       Bob   30  Los Angeles           5            5     30.0   
2   Charlie   35        delhi           7            7      NaN   
3     David   40      Houston          10           10  90000.0   
4       Eva   22      Houston           1            1  48000.0   
5     Frank   28          NaN           3            3  72000.0   
6     Grace   32  San Antonio           6            6  85000.0   
7     Helen   26    San Diego           2            2  62000.0   
8     Helen   26    San Diego           2            2  62000.0   
9     Helen   26    San Diego           2            2  62000.0   
10    Jerry   23      Phoenix           6            6  78000.0   

    Salary_increase  
0           77000.0  
1              33.0  
2               NaN  
3           99000.0  
4           52800.0  
5           79200.0  
6           9350

In [6]:
# Drop the two helper columns that are no longer needed: 'Salary_increase'
# (added by the salary-increase step) and 'Experience2' (its values match
# 'Experience' in the printed data).
#
# `df` is rebound instead of using inplace=True, and errors='ignore' is
# passed, so this cell can be re-run safely: a second run finds the columns
# already gone and does nothing instead of raising KeyError.
# Trade-off: a misspelled column name is silently ignored as well.
columns_to_drop = ['Salary_increase', 'Experience2']
df = df.drop(columns=columns_to_drop, errors='ignore')
print("\nAfter dropping a column:\n",df)


After dropping a column:
        Name  Age         City  Experience   Salary
0     Alice   25      Chicago           2  70000.0
1       Bob   30  Los Angeles           5     30.0
2   Charlie   35        delhi           7      NaN
3     David   40      Houston          10  90000.0
4       Eva   22      Houston           1  48000.0
5     Frank   28          NaN           3  72000.0
6     Grace   32  San Antonio           6  85000.0
7     Helen   26    San Diego           2  62000.0
8     Helen   26    San Diego           2  62000.0
9     Helen   26    San Diego           2  62000.0
10    Jerry   23      Phoenix           6  78000.0


In [7]:
# Handling missing values, step 1: count the missing entries per column.
# isna() marks each missing cell as True; summing down each column turns
# those flags into a per-column count.
missing_values = df.isna().sum(axis=0)
print("\nMissing Values:\n", missing_values )


Missing Values:
 Name          0
Age           0
City          1
Experience    0
Salary        1
dtype: int64


In [8]:
# Fill the gaps column by column: a missing Salary gets the median of the
# salaries that are present, a missing City gets the placeholder 'Unknown'.
salary_median = df['Salary'].median()
fill_values = {'Salary': salary_median, 'City': 'Unknown'}
df = df.fillna(value=fill_values)
print("\nDataFrame after filling missing values:\n", df)


DataFrame after filling missing values:
        Name  Age         City  Experience   Salary
0     Alice   25      Chicago           2  70000.0
1       Bob   30  Los Angeles           5     30.0
2   Charlie   35        delhi           7  66000.0
3     David   40      Houston          10  90000.0
4       Eva   22      Houston           1  48000.0
5     Frank   28      Unknown           3  72000.0
6     Grace   32  San Antonio           6  85000.0
7     Helen   26    San Diego           2  62000.0
8     Helen   26    San Diego           2  62000.0
9     Helen   26    San Diego           2  62000.0
10    Jerry   23      Phoenix           6  78000.0


In [58]:
# Flag every row that exactly repeats an earlier row; the first occurrence
# of each repeated row is not flagged.
duplicates = df.duplicated(keep='first')
duplicate_rows = df[duplicates]
print("\nDuplicate Records:\n", duplicate_rows)


Duplicate Records:
     Name  Age       City   Salary  Experience
8  Helen   26  San Diego  62000.0           2
9  Helen   26  San Diego  62000.0           2


In [59]:
# Keep only the first occurrence of each fully identical row. All columns
# are compared (subset=None) and the surviving rows keep their original
# index labels, so the index may have gaps afterwards.
df = df.drop_duplicates(subset=None, keep='first')
print("\nDataFrame after removing duplicates:\n", df)


DataFrame after removing duplicates:
        Name  Age         City   Salary  Experience
0     Alice   25      Chicago  70000.0           2
1       Bob   30  Los Angeles     30.0           5
2   Charlie   35        delhi  66000.0           7
3     David   40      Houston  90000.0          10
4       Eva   22      Houston  48000.0           1
5     Frank   28      Unknown  72000.0           3
6     Grace   32  San Antonio  85000.0           6
7     Helen   26    San Diego  62000.0           2
10    Jerry   23      Phoenix  78000.0           6


In [60]:
# Save the cleaned data to data_clean.csv in the current working directory.
# index=False leaves the row labels out of the file. Without it they are
# written as an extra unnamed first column, which comes back as a stray
# "Unnamed: 0" column when the file is loaded again with pd.read_csv.
df.to_csv("data_clean.csv", index=False)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Encoding categorical variables: give each distinct City string an integer
# code and store the codes in a new 'City_encoded' column.
# The encoder is fitted and then applied to the same column, so every value
# it sees at transform time is already known to it.
label_encoder = LabelEncoder()
label_encoder.fit(df['City'])
df['City_encoded'] = label_encoder.transform(df['City'])

print("\nDataFrame after encoding City column:\n", df)



DataFrame after encoding City column:
        Name  Age         City  Experience   Salary  City_encoded
0     Alice   25      Chicago           2  70000.0             0
1       Bob   30  Los Angeles           5     30.0             2
2   Charlie   35        delhi           7  66000.0             7
3     David   40      Houston          10  90000.0             1
4       Eva   22      Houston           1  48000.0             1
5     Frank   28      Unknown           3  72000.0             6
6     Grace   32  San Antonio           6  85000.0             4
7     Helen   26    San Diego           2  62000.0             5
8     Helen   26    San Diego           2  62000.0             5
9     Helen   26    San Diego           2  62000.0             5
10    Jerry   23      Phoenix           6  78000.0             3


In [10]:
from sklearn.preprocessing import StandardScaler

# Feature scaling: standardise the numeric columns and write the scaled
# values back over the originals. The column list is named once so the
# selection and the assignment cannot drift apart.
#
# NOTE(review): the scaler is fitted on every row of `df`, before any
# train/test split, and 'Salary' (used as the target further down) is scaled
# together with the features. Statistics of the future test rows therefore
# leak into the scaled training data. Consider fitting the scaler on the
# training features only and leaving the target unscaled.
numeric_columns = ['Age', 'Salary', 'Experience']
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
print("\nDataFrame after scaling:\n", df)



DataFrame after scaling:
        Name       Age         City  Experience    Salary  City_encoded
0     Alice -0.669662      Chicago   -0.810885  0.297584             0
1       Bob  0.299586  Los Angeles    0.304082 -2.757521             2
2   Charlie  1.268833        delhi    1.047393  0.122931             7
3     David  2.238081      Houston    2.162360  1.170845             1
4       Eva -1.251211      Houston   -1.182540 -0.663004             1
5     Frank -0.088113      Unknown   -0.439229  0.384910             6
6     Grace  0.687285  San Antonio    0.675737  0.952530             4
7     Helen -0.475812    San Diego   -0.810885 -0.051721             5
8     Helen -0.475812    San Diego   -0.810885 -0.051721             5
9     Helen -0.475812    San Diego   -0.810885 -0.051721             5
10    Jerry -1.057361      Phoenix    0.675737  0.646888             3


In [13]:
# Split the data into a training part and a held-out testing part.
from sklearn.model_selection import train_test_split

feature_columns = ['Age', 'Experience', 'City_encoded']
X = df[feature_columns]
y = df['Salary']  # Salary is treated as the prediction target

# test_size=0.2 requests a 20% hold-out; the fixed random_state makes the
# shuffle, and therefore the split, repeatable from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=4)
X_train, X_test, y_train, y_test = split_parts

print(f"X_train shape: {X_train.shape}", f"X_test  shape: {X_test.shape}", sep="\n")

print("###############################")

print(f"\nTraining Features:\n{X_train}", f"\nTesting Features:\n{X_test}", sep="\n")

X_train shape: (8, 3)
X_test  shape: (3, 3)
###############################

Training Features:
         Age  Experience  City_encoded
9  -0.475812   -0.810885             5
2   1.268833    1.047393             7
6   0.687285    0.675737             4
0  -0.669662   -0.810885             0
1   0.299586    0.304082             2
5  -0.088113   -0.439229             6
7  -0.475812   -0.810885             5
10 -1.057361    0.675737             3

Testing Features:
        Age  Experience  City_encoded
3  2.238081    2.162360             1
8 -0.475812   -0.810885             5
4 -1.251211   -1.182540             1
