In [1]:
# use a T-test to calculate the statistic and p-value


import numpy as np
from scipy import stats

data =  np.array([78,74,75,80,77,73,79,76,74,74]) # sample score
mu_population =77 # hypothesized population mean
alpha =0.005 # significance level

#Step 2
t_stat, p_value = stats.ttest_1samp(data, mu_population)
# ttest_1samp wheteher the sample mean is statistically different from given population mean

# Step 3
print("Sample mean:", np.mean(data))
print("T-statistic:", t_stat)
print("P-value:", p_value)

#Step 4
#make decision based on p value
if p_value < alpha:
    print("Reject H0: The sample mean is significantly different from the population mean.")
else:
    print("Fail to reject H0: No significant difference from the population mean.")



Sample mean: 76.0
T-statistic: -1.3155870289605438
P-value: 0.22083801302732203
Fail to reject H0: No significant difference from the population mean.


In [2]:
import numpy as np
from scipy.stats import norm
# Step 1 
mu_population = 50 
sigma_population = 2
n=40
sample_mean = 49.2
alpha = 0.05

# Step 2 Z statistic
z_stat = (sample_mean - mu_population)/(sigma_population/ np.sqrt(n))

# step 3- calculate the p-value 
p_value = 2 *(1 - norm.cdf(abs(z_stat)))

# Step 4 
print("Z-statistic:", z_stat)
print("P-value:", p_value)

# Step 5
if p_value < alpha:
    print("Reject H0: The mean is significantly different from 50.")
else:
    print("Fail to reject H0: No significant difference from 50.")

Z-statistic: -2.529822128134694
P-value: 0.011412036386001967
Reject H0: The mean is significantly different from 50.


In [4]:
import pandas as pd

In [6]:
# Load the dataset from csv file
df = pd.read_csv(r"C:\Users\anokh\OneDrive\Desktop\veracity\messy_dataset_50.csv")
df.head(5)

Unnamed: 0,ID,Name,Age,City,Salary,Join Date,Department
0,19,Ivan,28,Chicago,52000,06-05-2025,
1,14,ALICE,22,los angeles,abc,05-05-2018,IT
2,4,Heidi,33,chicago,62000,10-09-2018,Finance
3,8,BOB,33,Chicago,58000,20-06-2019,Finance
4,3,frank,22,chicago,52000,,Finanace


In [7]:
# acess from last
df.tail()

Unnamed: 0,ID,Name,Age,City,Salary,Join Date,Department
45,19,Eve,35,chicago,abc,30-08-2020,Finance
46,9,,33,chicago,70000,15-12-2020,IT
47,8,Heidi,45,los angeles,52000,15-01-2020,Finance
48,15,ALICE,thirty,new york,50000,05-05-2018,IT
49,2,judy,,new york,60000,,Finance


In [8]:
# basic information
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          50 non-null     int64 
 1   Name        47 non-null     object
 2   Age         47 non-null     object
 3   City        45 non-null     object
 4   Salary      44 non-null     object
 5   Join Date   47 non-null     object
 6   Department  47 non-null     object
dtypes: int64(1), object(6)
memory usage: 2.9+ KB
None


In [9]:
# Missing value
print(df.isnull().sum())

ID            0
Name          3
Age           3
City          5
Salary        6
Join Date     3
Department    3
dtype: int64


In [10]:
# Duplicate rows
print("\n Number of Duplicate Rows:", df.duplicated().sum())


 Number of Duplicate Rows: 0


In [11]:
# unique value 
print("\n Unique value in each column:")
for col in df.columns:
    print(f"{col}: {df[col].nunique()}unique values")


 Unique value in each column:
ID: 20unique values
Name: 11unique values
Age: 9unique values
City: 7unique values
Salary: 12unique values
Join Date: 11unique values
Department: 4unique values


In [12]:
print(df.dtypes)

ID             int64
Name          object
Age           object
City          object
Salary        object
Join Date     object
Department    object
dtype: object


In [13]:
df.head(30)

Unnamed: 0,ID,Name,Age,City,Salary,Join Date,Department
0,19,Ivan,28,Chicago,52000,06-05-2025,
1,14,ALICE,22,los angeles,abc,05-05-2018,IT
2,4,Heidi,33,chicago,62000,10-09-2018,Finance
3,8,BOB,33,Chicago,58000,20-06-2019,Finance
4,3,frank,22,chicago,52000,,Finanace
5,4,,30,chicago,62000,05-05-2018,Finance
6,7,Eve,28,chicago,52000,23-07-2021,IT
7,10,Grace,33,New york,-1000,11-11-2019,Finanace
8,11,,45,,72000,20-06-2019,Finance
9,8,Ivan,,Chicago,abc,01-03-2021,Finance


In [14]:
# try to convert numeric columns age and salary to detect invalid entries
for col in ["Age", "Salary"]:
    invalid_entries = df[~df[col].astype(str).str.replace(".",'',1).str.isnumeric()][col]
    if not invalid_entries.empty:
        print(f"\nInvalid values found in {col}:", invalid_entries.tolist())



Invalid values found in Age: [nan, 'thirty', 'thirty', 'thirty', 'thirty', nan, 'thirty', nan]

Invalid values found in Salary: ['abc', '-1000', 'abc', 'abc', nan, nan, nan, nan, 'abc', nan, nan, '-1000', 'abc']


In [15]:
# Data Cleaning

In [16]:
df.columns = df.columns.str.strip().str.lower().str.replace(" ","_")
print("\nColumns names Standardized")
df.columns


Columns names Standardized


Index(['id', 'name', 'age', 'city', 'salary', 'join_date', 'department'], dtype='object')

In [17]:
df.head(5)

Unnamed: 0,id,name,age,city,salary,join_date,department
0,19,Ivan,28,Chicago,52000,06-05-2025,
1,14,ALICE,22,los angeles,abc,05-05-2018,IT
2,4,Heidi,33,chicago,62000,10-09-2018,Finance
3,8,BOB,33,Chicago,58000,20-06-2019,Finance
4,3,frank,22,chicago,52000,,Finanace


In [18]:
# Missing values
df['name'] = df['name'].fillna("Unknown")
print(df['name'])

0        Ivan
1       ALICE
2       Heidi
3         BOB
4       frank
5     Unknown
6        Eve 
7       Grace
8     Unknown
9        Ivan
10      frank
11     alice 
12     alice 
13     alice 
14     alice 
15      frank
16      Heidi
17    charlie
18       Ivan
19    charlie
20      frank
21       Eve 
22      ALICE
23       Ivan
24       judy
25    charlie
26      ALICE
27    charlie
28        BOB
29      ALICE
30      Heidi
31     alice 
32      Heidi
33      Heidi
34      Heidi
35       Ivan
36      Grace
37      frank
38       Eve 
39      ALICE
40       bob 
41        BOB
42      Grace
43    charlie
44       Eve 
45       Eve 
46    Unknown
47      Heidi
48      ALICE
49       judy
Name: name, dtype: object


In [19]:
print(df[df['name'] == "Unknown"])

    id     name age     city salary   join_date department
5    4  Unknown  30  chicago  62000  05-05-2018    Finance
8   11  Unknown  45      NaN  72000  20-06-2019    Finance
46   9  Unknown  33  chicago  70000  15-12-2020         IT


In [20]:
df.head(20)

Unnamed: 0,id,name,age,city,salary,join_date,department
0,19,Ivan,28.0,Chicago,52000,06-05-2025,
1,14,ALICE,22.0,los angeles,abc,05-05-2018,IT
2,4,Heidi,33.0,chicago,62000,10-09-2018,Finance
3,8,BOB,33.0,Chicago,58000,20-06-2019,Finance
4,3,frank,22.0,chicago,52000,,Finanace
5,4,Unknown,30.0,chicago,62000,05-05-2018,Finance
6,7,Eve,28.0,chicago,52000,23-07-2021,IT
7,10,Grace,33.0,New york,-1000,11-11-2019,Finanace
8,11,Unknown,45.0,,72000,20-06-2019,Finance
9,8,Ivan,,Chicago,abc,01-03-2021,Finance


In [21]:
df['age'] = pd.to_numeric(df['age'], errors = 'coerce')
df['age'] = df['age'].fillna(df['age'].median())
print(df['age'])

0     28.0
1     22.0
2     33.0
3     33.0
4     22.0
5     30.0
6     28.0
7     33.0
8     45.0
9     33.0
10    22.0
11    45.0
12    33.0
13    28.0
14    25.0
15    35.0
16    35.0
17    28.0
18    22.0
19    33.0
20    33.0
21    45.0
22    33.0
23    45.0
24    28.0
25    33.0
26    25.0
27    40.0
28    22.0
29    33.0
30    22.0
31    40.0
32    28.0
33    33.0
34    45.0
35    35.0
36    35.0
37    25.0
38    25.0
39    35.0
40    33.0
41    28.0
42    33.0
43    35.0
44    33.0
45    35.0
46    33.0
47    45.0
48    33.0
49    33.0
Name: age, dtype: float64


In [22]:
df['city']

0         Chicago
1     los angeles
2         chicago
3         Chicago
4         chicago
5         chicago
6         chicago
7        New york
8             NaN
9         Chicago
10       new york
11      new york 
12            NaN
13            NaN
14    los angeles
15    los angeles
16      new york 
17        chicago
18    los angeles
19       New york
20    los angeles
21        chicago
22       new york
23        chicago
24    los angeles
25      new york 
26            NaN
27        Unknown
28      new york 
29        Chicago
30        Unknown
31        chicago
32        Chicago
33    los angeles
34        Unknown
35        Unknown
36            NaN
37       new york
38       new york
39       New york
40    los angeles
41        chicago
42       New york
43        Unknown
44        Chicago
45        chicago
46        chicago
47    los angeles
48      new york 
49       new york
Name: city, dtype: object

In [23]:
df['city'] = df['city'].fillna("Unknown")
(df['city'])

0         Chicago
1     los angeles
2         chicago
3         Chicago
4         chicago
5         chicago
6         chicago
7        New york
8         Unknown
9         Chicago
10       new york
11      new york 
12        Unknown
13        Unknown
14    los angeles
15    los angeles
16      new york 
17        chicago
18    los angeles
19       New york
20    los angeles
21        chicago
22       new york
23        chicago
24    los angeles
25      new york 
26        Unknown
27        Unknown
28      new york 
29        Chicago
30        Unknown
31        chicago
32        Chicago
33    los angeles
34        Unknown
35        Unknown
36        Unknown
37       new york
38       new york
39       New york
40    los angeles
41        chicago
42       New york
43        Unknown
44        Chicago
45        chicago
46        chicago
47    los angeles
48      new york 
49       new york
Name: city, dtype: object

In [24]:
df.head(30)

Unnamed: 0,id,name,age,city,salary,join_date,department
0,19,Ivan,28.0,Chicago,52000,06-05-2025,
1,14,ALICE,22.0,los angeles,abc,05-05-2018,IT
2,4,Heidi,33.0,chicago,62000,10-09-2018,Finance
3,8,BOB,33.0,Chicago,58000,20-06-2019,Finance
4,3,frank,22.0,chicago,52000,,Finanace
5,4,Unknown,30.0,chicago,62000,05-05-2018,Finance
6,7,Eve,28.0,chicago,52000,23-07-2021,IT
7,10,Grace,33.0,New york,-1000,11-11-2019,Finanace
8,11,Unknown,45.0,Unknown,72000,20-06-2019,Finance
9,8,Ivan,33.0,Chicago,abc,01-03-2021,Finance


In [25]:
df['salary'] = pd.to_numeric(df['salary'], errors = 'coerce')
df['salary'] = df['salary'].fillna(df['salary'].median())
(df['salary'])

0     52000.0
1     60000.0
2     62000.0
3     58000.0
4     52000.0
5     62000.0
6     52000.0
7     -1000.0
8     72000.0
9     60000.0
10    52000.0
11    70000.0
12    70000.0
13    65000.0
14    62000.0
15    52000.0
16    60000.0
17    60000.0
18    60000.0
19    60000.0
20    58000.0
21    50000.0
22    72000.0
23    72000.0
24    60000.0
25    50000.0
26    60000.0
27    60000.0
28    62000.0
29    60000.0
30    62000.0
31    52000.0
32    60000.0
33    55000.0
34    60000.0
35    58000.0
36    72000.0
37    65000.0
38    55000.0
39    55000.0
40    60000.0
41    72000.0
42    62000.0
43    22000.0
44    -1000.0
45    60000.0
46    70000.0
47    52000.0
48    50000.0
49    60000.0
Name: salary, dtype: float64

In [26]:
# convert joun date to determine and fill missing value with mode
df['join_date'] = pd.to_datetime(df['join_date'], errors= 'coerce', dayfirst = True)
df['join_date'] = df['join_date'].fillna(df['join_date'].mode()[0])
print(df['join_date'])

0    2025-05-06
1    2018-05-05
2    2018-09-10
3    2019-06-20
4    2019-06-20
5    2018-05-05
6    2021-07-23
7    2019-11-11
8    2019-06-20
9    2021-03-01
10   2021-03-01
11   2019-06-20
12   2019-06-20
13   2020-08-30
14   2021-07-23
15   2020-08-30
16   2020-01-15
17   2018-09-10
18   2020-01-15
19   2021-07-23
20   2021-07-23
21   2020-08-30
22   2019-06-20
23   2020-01-15
24   2018-05-05
25   2019-06-20
26   2019-06-20
27   2019-06-20
28   2020-12-15
29   2020-12-15
30   2019-11-11
31   2020-01-15
32   2018-05-05
33   2021-07-23
34   2019-06-20
35   2020-01-15
36   2020-08-30
37   2019-06-20
38   2019-11-11
39   2019-06-20
40   2019-06-20
41   2018-05-05
42   2020-12-15
43   2019-06-20
44   2021-03-01
45   2020-08-30
46   2020-12-15
47   2020-01-15
48   2018-05-05
49   2019-06-20
Name: join_date, dtype: datetime64[ns]


In [27]:
df.head(30)

Unnamed: 0,id,name,age,city,salary,join_date,department
0,19,Ivan,28.0,Chicago,52000.0,2025-05-06,
1,14,ALICE,22.0,los angeles,60000.0,2018-05-05,IT
2,4,Heidi,33.0,chicago,62000.0,2018-09-10,Finance
3,8,BOB,33.0,Chicago,58000.0,2019-06-20,Finance
4,3,frank,22.0,chicago,52000.0,2019-06-20,Finanace
5,4,Unknown,30.0,chicago,62000.0,2018-05-05,Finance
6,7,Eve,28.0,chicago,52000.0,2021-07-23,IT
7,10,Grace,33.0,New york,-1000.0,2019-11-11,Finanace
8,11,Unknown,45.0,Unknown,72000.0,2019-06-20,Finance
9,8,Ivan,33.0,Chicago,60000.0,2021-03-01,Finance


In [28]:
df['department'] = df['department'].fillna("Unknown")
print(df['department'])

0      Unknown
1           IT
2      Finance
3      Finance
4     Finanace
5      Finance
6           IT
7     Finanace
8      Finance
9      Finance
10          HR
11          IT
12          HR
13          IT
14     Finance
15     Unknown
16          IT
17          HR
18          HR
19     Finance
20     Finance
21          IT
22          HR
23          IT
24     Unknown
25     Finance
26          HR
27          IT
28     Finance
29     Finance
30     Finance
31     Finance
32          IT
33          HR
34          HR
35     Finance
36          HR
37     Finance
38          HR
39     Finance
40          HR
41    Finanace
42    Finanace
43          HR
44          HR
45     Finance
46          IT
47     Finance
48          IT
49     Finance
Name: department, dtype: object


In [29]:
df.head(20)

Unnamed: 0,id,name,age,city,salary,join_date,department
0,19,Ivan,28.0,Chicago,52000.0,2025-05-06,Unknown
1,14,ALICE,22.0,los angeles,60000.0,2018-05-05,IT
2,4,Heidi,33.0,chicago,62000.0,2018-09-10,Finance
3,8,BOB,33.0,Chicago,58000.0,2019-06-20,Finance
4,3,frank,22.0,chicago,52000.0,2019-06-20,Finanace
5,4,Unknown,30.0,chicago,62000.0,2018-05-05,Finance
6,7,Eve,28.0,chicago,52000.0,2021-07-23,IT
7,10,Grace,33.0,New york,-1000.0,2019-11-11,Finanace
8,11,Unknown,45.0,Unknown,72000.0,2019-06-20,Finance
9,8,Ivan,33.0,Chicago,60000.0,2021-03-01,Finance


In [30]:
df['name'] = df['name'].str.title() # Name in proper case
df['city'] = df['city'].str.title().str.strip()
df['department'] = df['department'].str.title().str.strip()

In [31]:
print(df.head(10))

   id     name   age         city   salary  join_date department
0  19     Ivan  28.0      Chicago  52000.0 2025-05-06    Unknown
1  14    Alice  22.0  Los Angeles  60000.0 2018-05-05         It
2   4    Heidi  33.0      Chicago  62000.0 2018-09-10    Finance
3   8      Bob  33.0      Chicago  58000.0 2019-06-20    Finance
4   3    Frank  22.0      Chicago  52000.0 2019-06-20   Finanace
5   4  Unknown  30.0      Chicago  62000.0 2018-05-05    Finance
6   7     Eve   28.0      Chicago  52000.0 2021-07-23         It
7  10    Grace  33.0     New York  -1000.0 2019-11-11   Finanace
8  11  Unknown  45.0      Unknown  72000.0 2019-06-20    Finance
9   8     Ivan  33.0      Chicago  60000.0 2021-03-01    Finance


In [32]:
print(df[['name','city', 'department']].head())

    name         city department
0   Ivan      Chicago    Unknown
1  Alice  Los Angeles         It
2  Heidi      Chicago    Finance
3    Bob      Chicago    Finance
4  Frank      Chicago   Finanace


In [33]:
df.tail(25)

Unnamed: 0,id,name,age,city,salary,join_date,department
25,5,Charlie,33.0,New York,50000.0,2019-06-20,Finance
26,6,Alice,25.0,Unknown,60000.0,2019-06-20,Hr
27,16,Charlie,40.0,Unknown,60000.0,2019-06-20,It
28,1,Bob,22.0,New York,62000.0,2020-12-15,Finance
29,4,Alice,33.0,Chicago,60000.0,2020-12-15,Finance
30,18,Heidi,22.0,Unknown,62000.0,2019-11-11,Finance
31,18,Alice,40.0,Chicago,52000.0,2020-01-15,Finance
32,16,Heidi,28.0,Chicago,60000.0,2018-05-05,It
33,20,Heidi,33.0,Los Angeles,55000.0,2021-07-23,Hr
34,19,Heidi,45.0,Unknown,60000.0,2019-06-20,Hr


In [34]:
department_correction = { "Finanace" : "Finance","Accouting" : "Accounting", "It":"IT", "Humman Resource":"Human Resource"}
df['department'] = df['department'].replace(department_correction)
print("\nAfter Fixing Typos:\n",df['department'].unique())


After Fixing Typos:
 ['Unknown' 'IT' 'Finance' 'Hr']


In [35]:
df.head(20)

Unnamed: 0,id,name,age,city,salary,join_date,department
0,19,Ivan,28.0,Chicago,52000.0,2025-05-06,Unknown
1,14,Alice,22.0,Los Angeles,60000.0,2018-05-05,IT
2,4,Heidi,33.0,Chicago,62000.0,2018-09-10,Finance
3,8,Bob,33.0,Chicago,58000.0,2019-06-20,Finance
4,3,Frank,22.0,Chicago,52000.0,2019-06-20,Finance
5,4,Unknown,30.0,Chicago,62000.0,2018-05-05,Finance
6,7,Eve,28.0,Chicago,52000.0,2021-07-23,IT
7,10,Grace,33.0,New York,-1000.0,2019-11-11,Finance
8,11,Unknown,45.0,Unknown,72000.0,2019-06-20,Finance
9,8,Ivan,33.0,Chicago,60000.0,2021-03-01,Finance
