## Importing all the libraries for first question

In [697]:
import pandas as pd
import numpy as np


## Reading csv file in data

In [698]:
data = pd.read_csv("Auto MPG - Sheet1.csv")

In [699]:
print(data.head())

    mpg  cylinders  displacement horsepower  weight  acceleration  model year   
0  18.0          8         307.0        130    3504          12.0          70  \
1  15.0          8         350.0        165    3693          11.5          70   
2  18.0          8         318.0        150    3436          11.0          70   
3  16.0          8         304.0        150    3433          12.0          70   
4  17.0          8         302.0        140    3449          10.5          70   

   origin  
0       1  
1       1  
2       1  
3       1  
4       1  


#### From the ucl website it's given already
#### That these are the configuration of data types 
  ######  1. mpg:           continuous
  ######  2. cylinders:     multi-valued discrete
  ######  3. displacement:  continuous
  ######  4. horsepower:    continuous
  ######  5. weight:        continuous
  ######  6. acceleration:  continuous
  ######  7. model year:    multi-valued discrete
  ######  8. origin:        multi-valued discrete

### Printing the unique values of cylinders , model year , origin

In [700]:
print((data['cylinders'].unique()))

[8 4 6 3 5]


In [701]:
print((data['model year'].unique()))

[70 71 72 73 74 75 76 77 78 79 80 81 82]


In [702]:
print((data['origin'].unique()))

[1 3 2]


In [703]:
df = pd.DataFrame(data)

In [704]:
discrete_columns = ['cylinders', 'model year', 'origin']

### Encoded this on basis of the the number of rows of that specefic unique value the least one gets the first 1 in first place , highest gets 1 in last place

Column - Cylinder 
#### For e.g.  - These are the unique values in this case [8 4 6 3 5]

| Value | Number of Rows | Encoding      |
|-------|----------------|---------------|
| 4     | 204            | 0 ,  0 ,  0 ,  0 , 1 |
| 8     | 103            | 0 ,  0 ,  0 ,  1 , 0 |
| 6     | 84             | 0 ,  0 ,  1 ,  0 , 0 |
| 3     | 4              | 0 ,  1 ,  0 ,  0 ,  0 |
| 5     | 3              | 1 ,  0 ,  0 ,  0 ,  0 |


## where each column has 1 denoting the number of numbers of 5 

In [705]:

# Create custom binary encoding for each discrete column
for column in discrete_columns:
    column_values = df[column]
    
    # Calculate unique values and their counts
    unique_values, value_counts = column_values.unique(), column_values.value_counts()

    # Iterate through unique values and print both the value and the number of rows
    
    print(value_counts)

    # print("column name is " , value_counts   )

    # Sort unique values based on counts in ascending order
    unique_values = unique_values[np.argsort(value_counts)]
    print("after sorting on basis of row " , unique_values)

    # Create custom binary encoding based on counts
    for i, unique_value in enumerate(unique_values):
        encoding = ['0'] * len(unique_values)
        encoding[i] = '1'
        encoding = ''.join(encoding)
        print('this is encoding for ' , unique_value  , "value is ",  encoding)
        # Create a new column with the custom encoding
        column_name = f'{column}_{unique_value}'
        df[column_name] = (column_values == unique_value).astype(int)


cylinders
4    204
8    103
6     84
3      4
5      3
Name: count, dtype: int64
after sorting on basis of row  [5 3 6 4 8]
this is encoding for  5 value is  10000
this is encoding for  3 value is  01000
this is encoding for  6 value is  00100
this is encoding for  4 value is  00010
this is encoding for  8 value is  00001
model year
73    40
78    36
76    34
82    31
75    30
70    29
79    29
80    29
81    29
71    28
72    28
77    28
74    27
Name: count, dtype: int64
after sorting on basis of row  [82 79 80 81 75 76 77 78 74 73 72 71 70]
this is encoding for  82 value is  1000000000000
this is encoding for  79 value is  0100000000000
this is encoding for  80 value is  0010000000000
this is encoding for  81 value is  0001000000000
this is encoding for  75 value is  0000100000000
this is encoding for  76 value is  0000010000000
this is encoding for  77 value is  0000001000000
this is encoding for  78 value is  0000000100000
this is encoding for  74 value is  0000000010000
this is e

##  Below are the columns in modified csv

### mpg , displacement , horsepower , weight , acceleration , cylinders_5 , cylinders_3 , cylinders_6 , cylinders_4 , cylinders_8 , model year_82 , model year_79 , model year_80 , model year_81 , model year_75 , model year_76 , model year_77 , model year_78 , model year_74 , model year_73 , model year_72 , model year_71 , model year_70 , origin_2 , origin_3 , origin_1

#### making each column of the unique values present in the continous-discrete value column

#### No need of columns which were discrete are now hot encoded so dropping them

In [706]:

# Drop the original discrete columns
df.drop(discrete_columns, axis=1, inplace=True)

# Display the modified DataFrame with the custom binary encoding
print(df)


      mpg  displacement horsepower  weight  acceleration  cylinders_5   
0    18.0         307.0        130    3504          12.0            0  \
1    15.0         350.0        165    3693          11.5            0   
2    18.0         318.0        150    3436          11.0            0   
3    16.0         304.0        150    3433          12.0            0   
4    17.0         302.0        140    3449          10.5            0   
..    ...           ...        ...     ...           ...          ...   
393  27.0         140.0         86    2790          15.6            0   
394  44.0          97.0         52    2130          24.6            0   
395  32.0         135.0         84    2295          11.6            0   
396  28.0         120.0         79    2625          18.6            0   
397  31.0         119.0         82    2720          19.4            0   

     cylinders_3  cylinders_6  cylinders_4  cylinders_8  ...  model year_77   
0              0            0            0  

## Find columns with missing data

In [707]:
non_numeric_columns = data.select_dtypes(exclude=['number'])


In [708]:
# Display the list of non-numeric columns
print(non_numeric_columns.columns)


Index(['horsepower'], dtype='object')


## From this there are some missing data in horsepower column

In [709]:
# Convert 'horsepower' column to numeric (replace '?' with NaN)
df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')

In [710]:
# Calculate the mean of 'horsepower' excluding NaN values
mean_horsepower = df['horsepower'].mean()

#### We assume that the missing values are near the mean of that column

In [711]:
# Replace NaN values (formerly '?') with the calculated mean
df['horsepower'].fillna(mean_horsepower, inplace=True)

In [712]:
# Display the updated DataFrame
print(df)

      mpg  displacement  horsepower  weight  acceleration  cylinders_5   
0    18.0         307.0       130.0    3504          12.0            0  \
1    15.0         350.0       165.0    3693          11.5            0   
2    18.0         318.0       150.0    3436          11.0            0   
3    16.0         304.0       150.0    3433          12.0            0   
4    17.0         302.0       140.0    3449          10.5            0   
..    ...           ...         ...     ...           ...          ...   
393  27.0         140.0        86.0    2790          15.6            0   
394  44.0          97.0        52.0    2130          24.6            0   
395  32.0         135.0        84.0    2295          11.6            0   
396  28.0         120.0        79.0    2625          18.6            0   
397  31.0         119.0        82.0    2720          19.4            0   

     cylinders_3  cylinders_6  cylinders_4  cylinders_8  ...  model year_77   
0              0            0   

### Verifying if there is any missinng value still?

In [713]:
# Check if there are any '?' in the 'horsepower' column
has_question_marks = (df['horsepower'] == '?')

# Check if there are any rows with '?' in the 'horsepower' column
if has_question_marks.any():
    print("There are still '?' in the 'horsepower' column.")
else:
    print("There are no '?' in the 'horsepower' column.")


There are no '?' in the 'horsepower' column.


#### Converting to csv with all the datacleaning and hot encoding done

In [714]:
df.to_csv('modified_data_binary.csv', index=False)

https://www.geeksforgeeks.org/ml-one-hot-encoding-of-datasets-in-python/

## Read the csv file and print the 5 rows

In [715]:
data = pd.read_csv("modified_data_binary.csv")

In [716]:
print(data.head())

    mpg  displacement  horsepower  weight  acceleration  cylinders_5   
0  18.0         307.0       130.0    3504          12.0            0  \
1  15.0         350.0       165.0    3693          11.5            0   
2  18.0         318.0       150.0    3436          11.0            0   
3  16.0         304.0       150.0    3433          12.0            0   
4  17.0         302.0       140.0    3449          10.5            0   

   cylinders_3  cylinders_6  cylinders_4  cylinders_8  ...  model year_77   
0            0            0            0            1  ...              0  \
1            0            0            0            1  ...              0   
2            0            0            0            1  ...              0   
3            0            0            0            1  ...              0   
4            0            0            0            1  ...              0   

   model year_78  model year_74  model year_73  model year_72  model year_71   
0              0        

  ----------------------------------------------------------------

1) b) Let D = {x1, x2, . . . , xn} be n objects consists of d features, i.e., for every 1 ≤ i ≤ n, xi = [xi1, xi2, . . . , xid] ∈ R
d. The variance σ2 of D is defined as σ2 :=
Xn i=1 (xi − x ̄)T (xi − x ̄) = 1nXn i=1∥xi − x ̄∥22, where for any a = [a1, . . . , ad] ∈ R d, the aT a = ∥a∥22 =P 1≤i≤d
(ai) 2 and the mean x ̄ = 1 n P 1≤i≤n xi . Now, use the file you have saved in (a) and compute the mean  ̄x and variance σ2 of the data in it.

## Print the mean of those columns

In [717]:
totalsum = 0
totallen = 0
meanofcols = {}

In [718]:
for column in data : 
  meanofcolumn = 0
  numberofrowsincol = len(data[column])
  for x in data[column] :
    meanofcolumn += x
    totalsum += x
    totallen += 1
  meanofcolumn = meanofcolumn / numberofrowsincol
  print(column ,'   ' , meanofcolumn)
  meanofcols[column] = meanofcolumn
  

mpg     23.514572864321615
displacement     193.42587939698493
horsepower     104.46938775510206
weight     2970.424623115578
acceleration     15.568090452261291
cylinders_5     0.007537688442211055
cylinders_3     0.010050251256281407
cylinders_6     0.21105527638190955
cylinders_4     0.5125628140703518
cylinders_8     0.25879396984924624
model year_82     0.07788944723618091
model year_79     0.0728643216080402
model year_80     0.0728643216080402
model year_81     0.0728643216080402
model year_75     0.07537688442211055
model year_76     0.08542713567839195
model year_77     0.07035175879396985
model year_78     0.09045226130653267
model year_74     0.0678391959798995
model year_73     0.10050251256281408
model year_72     0.07035175879396985
model year_71     0.07035175879396985
model year_70     0.0728643216080402
origin_2     0.17587939698492464
origin_3     0.1984924623115578
origin_1     0.6256281407035176


In [719]:
print("mean of data is " , np.mean(data))
# Calculate the variance for each numeric column


mean of data is  127.32317513785569


In [720]:

# Load the modified data from the CSV file
data = pd.read_csv("modified_data_binary.csv")


In [721]:

totalsum = 0
totallen = 0
varianceofcols = {}

In [722]:
for column in data : 
  varianceofcolumn = 0
  numberofrowsincol = len(data[column])
  for x in data[column] :
    # according to question
    varianceofcolumn += math.pow((x - meanofcols[column]) , 2)
    totalsum += x
    totallen += 1
  varianceofcolumn = varianceofcolumn / numberofrowsincol
  print(column ,'   ' , varianceofcolumn)
  varianceofcols[column] = varianceofcolumn
  


mpg     60.93611928991693
displacement     10844.882068950259
horsepower     1455.5116398318166
weight     715339.1287404363
acceleration     7.585740574732961
cylinders_5     0.007480871695159248
cylinders_3     0.009949243705967031
cylinders_6     0.16651094669326397
cylinders_4     0.2498421757026331
cylinders_8     0.19181965101891388
model year_82     0.07182268124542314
model year_79     0.06755511224464027
model year_80     0.06755511224464027
model year_81     0.06755511224464027
model year_75     0.06969520971692582
model year_76     0.07812934016817805
model year_77     0.06540238882856525
model year_78     0.08227064973106776
model year_74     0.0632370394687012
model year_73     0.09040175753137489
model year_72     0.06540238882856558
model year_71     0.06540238882856562
model year_70     0.06755511224464028
origin_2     0.14494583470114322
origin_3     0.15909320471705204
origin_1     0.23421757026337828


------------------------------------------------------------------------------------------------------------------

### Observation : Variance is extremely high for : displacement , weight , even horsepower( but way less than displacement) as compared to others
### We scale the data using ( value at that point - mean of all the points of that column)/variance of the data of that column          

In [723]:
df = pd.DataFrame(data)

In [724]:
# Normalize each feature with its mean and variance
normalized_data = {}
for column in df.columns:
    normalized_data[column] = [(x - meanofcols[column]) / (varianceofcols[column] ** 0.5) for x in df[column]]


In [725]:
print(normalized_data)

{'mpg': [-0.7064387006509839, -1.0907506236404472, -0.7064387006509839, -0.962646649310626, -0.8345426749808049, -1.0907506236404472, -1.218854597970268, -1.218854597970268, -1.218854597970268, -1.0907506236404472, -1.0907506236404472, -1.218854597970268, -1.0907506236404472, -1.218854597970268, 0.062185145327942455, -0.19402280333169966, -0.7064387006509839, -0.3221267776615207, 0.44649706831740565, 0.3183930939875846, 0.19028911965776352, 0.062185145327942455, 0.19028911965776352, 0.3183930939875846, -0.3221267776615207, -1.7312704952895523, -1.7312704952895523, -1.6031665209597314, -1.8593744696193735, 0.44649706831740565, 0.5746010426472267, 0.19028911965776352, 0.19028911965776352, -0.5783347263211629, -0.962646649310626, -0.8345426749808049, -0.5783347263211629, -0.7064387006509839, -1.218854597970268, -1.218854597970268, -1.218854597970268, -1.218854597970268, -1.4750625466299103, -1.3469585723000892, -1.3469585723000892, -0.7064387006509839, -0.19402280333169966, -0.57833472632

In [726]:
totalsum = 0
totallen = 0
meanofnormalizedcols = {}

In [727]:
for column in normalized_data : 
  meanofcolumn = 0
  numberofrowsincol = len(data[column])
  for x in data[column] :
    meanofcolumn += x
    totalsum += x
    totallen += 1
  meanofcolumn = meanofcolumn / numberofrowsincol
  print(column ,'   ' , meanofcolumn)
  meanofnormalizedcols[column] = meanofcolumn

mpg     23.514572864321615
displacement     193.42587939698493
horsepower     104.46938775510206
weight     2970.424623115578
acceleration     15.568090452261291
cylinders_5     0.007537688442211055
cylinders_3     0.010050251256281407
cylinders_6     0.21105527638190955
cylinders_4     0.5125628140703518
cylinders_8     0.25879396984924624
model year_82     0.07788944723618091
model year_79     0.0728643216080402
model year_80     0.0728643216080402
model year_81     0.0728643216080402
model year_75     0.07537688442211055
model year_76     0.08542713567839195
model year_77     0.07035175879396985
model year_78     0.09045226130653267
model year_74     0.0678391959798995
model year_73     0.10050251256281408
model year_72     0.07035175879396985
model year_71     0.07035175879396985
model year_70     0.0728643216080402
origin_2     0.17587939698492464
origin_3     0.1984924623115578
origin_1     0.6256281407035176


In [728]:
print("mean of data is " , np.mean(data))
# Calculate the variance for each numeric column

mean of data is  127.32317513785569


In [729]:
# Compute the variance of the normalized data
normalized_variances = {}

In [730]:
for column in normalized_data : 
  varianceofcolumn = 0
  numberofrowsincol = len(data[column])
  for x in normalized_data[column] :
    # according to question
    varianceofcolumn += math.pow((x - meanofnormalizedcols[column]) , 2)
    totalsum += x
    totallen += 1
  varianceofcolumn = varianceofcolumn / numberofrowsincol
  print(column ,'   ' , varianceofcolumn)
  normalized_variances[column] = varianceofcolumn
  

mpg     553.9351369914901
displacement     37414.57082049704
horsepower     10914.85297792586
weight     8823423.441611337
acceleration     243.36544032978892
cylinders_5     1.0000568167470514
cylinders_3     1.0001010075503225
cylinders_6     1.0445443296886598
cylinders_4     1.262720638367713
cylinders_8     1.0669743188303356
model year_82     1.0060667659907583
model year_79     1.0053092093634
model year_80     1.0053092093634
model year_81     1.0053092093634
model year_75     1.0056816747051784
model year_76     1.0072977955102038
model year_77     1.0049493699654024
model year_78     1.0081816115754587
model year_74     1.0046021565111873
model year_73     1.0101007550314607
model year_72     1.0049493699654004
model year_71     1.0049493699653995
model year_70     1.005309209363399
origin_2     1.0309335622837907
origin_3     1.0393992575945084
origin_1     1.3914105704401356


In [731]:
df = pd.DataFrame(data)

# Compute the variance of each column
variance = df.var()

# Normalize each feature with its mean and variance
normalized_df = (df - df.mean()) / df.std()

# Compute the variance of the normalized data
normalized_variance = normalized_df.std()

# Display the variance of the normalized data for each column
print(normalized_variance)


mpg              1.0
displacement     1.0
horsepower       1.0
weight           1.0
acceleration     1.0
cylinders_5      1.0
cylinders_3      1.0
cylinders_6      1.0
cylinders_4      1.0
cylinders_8      1.0
model year_82    1.0
model year_79    1.0
model year_80    1.0
model year_81    1.0
model year_75    1.0
model year_76    1.0
model year_77    1.0
model year_78    1.0
model year_74    1.0
model year_73    1.0
model year_72    1.0
model year_71    1.0
model year_70    1.0
origin_2         1.0
origin_3         1.0
origin_1         1.0
dtype: float64


In [732]:
normalized_data = {}
for column in df.columns:
    normalized_data[column] = [(x - meanofcols[column]) / (varianceofcols[column] ** 0.5) for x in df[column]]

# Compute the variance of the normalized data
normalized_varianceofcols = {}
for column in df.columns:
    normalized_varianceofcols[column] = sum((x - meanofcols[column]) ** 2 for x in normalized_data[column]) / (len(normalized_data[column]) - 1)

# Display the variance of the normalized data for each column
for column, normalized_variance in normalized_varianceofcols.items():
    print(f"Variance of normalized {column}: {normalized_variance}")


Variance of normalized mpg: 555.3304396035594
Variance of normalized displacement: 37508.814071934066
Variance of normalized horsepower: 10942.34631036396
Variance of normalized weight: 8845648.689575093
Variance of normalized acceleration: 243.97845151449872
Variance of normalized cylinders_5: 1.0025758515499408
Variance of normalized cylinders_3: 1.0026201536650587
Variance of normalized cylinders_6: 1.0471754237181023
Variance of normalized cylinders_4: 1.2659012948875308
Variance of normalized cylinders_8: 1.0696619115729813
Variance of normalized model year_82: 1.0086009392048407
Variance of normalized model year_79: 1.0078414743743909
Variance of normalized model year_80: 1.0078414743743909
Variance of normalized model year_81: 1.0078414743743909
Variance of normalized model year_75: 1.0082148779160227
Variance of normalized model year_76: 1.0098350695543101
Variance of normalized model year_77: 1.0074807285799248
Variance of normalized model year_78: 1.0107211118565051
Variance 

1 ) c) You might notice that the variance of the data is highly dominated by few
features compared to other features. So, normalize each feature of the saved
data with its mean and variance. Now compute the variance of the normalized
data.

In [733]:
df = pd.DataFrame(data)

# Compute the variance of each column
variance = df.var()

# Normalize each feature with its mean and variance
normalized_df = (df - df.mean()) / df.var()**0.5

# Compute the variance of the normalized data
normalized_variance = normalized_df.var()

# Display the variance of the normalized data for each column
print(normalized_variance)
# This code will give you the variance of the normalized data for each column, which will indicate how much each feature contributes to the variance of the data after normalization.


mpg              1.0
displacement     1.0
horsepower       1.0
weight           1.0
acceleration     1.0
cylinders_5      1.0
cylinders_3      1.0
cylinders_6      1.0
cylinders_4      1.0
cylinders_8      1.0
model year_82    1.0
model year_79    1.0
model year_80    1.0
model year_81    1.0
model year_75    1.0
model year_76    1.0
model year_77    1.0
model year_78    1.0
model year_74    1.0
model year_73    1.0
model year_72    1.0
model year_71    1.0
model year_70    1.0
origin_2         1.0
origin_3         1.0
origin_1         1.0
dtype: float64


--------------------------------------------------------------------------------------------------------------------------------------------