In [378]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer


In [380]:
data = pd.read_csv(r"C:\Users\Chet\Documents\Messy data set.csv")


In [388]:
data.head(8)

Unnamed: 0,ID,Name,Age,Email,JoinDate,Salary
0,1,Alice,25,alice@example.com,2020-01-01,50000
1,2,Bob,30,bob@example.com,not a date,60000
2,2,Bob,30,bob@example.com,2020-02-01,60000
3,4,Charlie,,charlie@,2020-03-01,70000
4,5,David,Twenty-two,,2020-04-01,
5,6,Eve,45,eve@example,2020-05-01,90000
6,7,Frank,50,frank@@example.com,2020-06-01,100000
7,8,Grace,55,grace@example.com,2020-07-01,one lakh


####  Identify Missing Values

In [390]:
missing_data = data.isnull().sum()
print("Missing Data")
print(missing_data)


Missing Data
ID          3
Name        4
Age         4
Email       4
JoinDate    3
Salary      4
dtype: int64


#### Imputing Missing values

In [403]:
imputer = SimpleImputer(strategy='mean')


In [405]:
data[['column1', 'column2']] = imputer.fit_transform(data[['column1', 'column2']])


In [407]:
print(data)

    column1  column2
0  1.000000      6.0
1  2.000000      5.0
2  2.333333      6.0
3  4.000000      7.0


#### Checking datatype

In [410]:
print(data.dtypes)           


column1    float64
column2    float64
dtype: object


#### Removing Value

In [443]:
unique_values = data['Age'].unique()


In [445]:
print("Unique values in 'Age':", unique_values)


Unique values in 'Age': [25 30 40]


#### Assigning data type

In [447]:
data = pd.DataFrame({'Age': [25, 30, 35, 40]})


In [449]:
data['Age'] = data['Age'].astype(int)
print(data)

   Age
0   25
1   30
2   35
3   40


####  Dealing with duplicates

In [495]:
data = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'Bob'],
    'salary': [50000, 60000, 70000, 60000]
})

In [497]:
duplicates = data.duplicated()
print(data[duplicates]['salary'])

3    60000
Name: salary, dtype: int64


In [477]:
data.drop_duplicates(inplace=True)
data

Unnamed: 0,name,salary
0,Alice,50000
1,Bob,60000
2,Charlie,70000


####  Sorting data

In [469]:
sorted_data = data.sort_values(by='age', ascending=False)
print(data)


      name  age
0    Alice   25
1      Bob   30
2  Charlie   20
3    Grace   55


#### Boolean filtering

In [473]:
data = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'salary': [50000, 60000, 70000]
})

In [475]:
filtered_data = data[data['salary'] > 50000]
filtered_data


Unnamed: 0,name,salary
1,Bob,60000
2,Charlie,70000


#### Is, In

In [221]:
data = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie','Frank'],
    'Salary': ['High', 'Low', 'Medium','Large']
})

In [225]:
filtered_data = data[data['Salary'].isin(['High', 'Medium','Large'])]
filtered_data

Unnamed: 0,Name,Salary
0,Alice,High
2,Charlie,Medium
3,Frank,Large


####  Loc and Iloc

In [501]:
data

Unnamed: 0,name,salary
0,Alice,50000
1,Bob,60000
2,Charlie,70000
3,Bob,60000


In [551]:
data = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Alice', 'Charlie', 'Bob'],
    'Salary': [50000, 60000, 50000, 70000, 60000]})

In [553]:
data.loc[data['Salary'] > 50000, ['Name', 'Salary']]
print(data)


      Name  Salary
0    Alice   50000
1      Bob   60000
2    Alice   50000
3  Charlie   70000
4      Bob   60000


In [555]:
data.iloc[0:4, 1:3]
print(subset)


   Salary
0   50000
1   60000
2   50000
3   70000


####  Combining and condition

In [584]:
data = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'Age': [25, 35, 45, 28],
    'Salary': [50000, 60000, 70000, 48000]})

In [590]:
filtered_data = data[(data['Age'] > 30) | (data['Salary'] < 50000)]
filtered_data

Unnamed: 0,Name,Age,Salary
1,Bob,35,60000
2,Charlie,45,70000
3,Diana,28,48000


In [592]:
filtered_data = data[(data['Age'] > 20) & (data['Salary'] < 60000)]
filtered_data


Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
3,Diana,28,48000
