# Data Analytics I: Assignment 2

In [23]:
import numpy as np
import pandas as pd
import csv
import os
import sys


In [24]:
# extracting data from the csv
def extract_data(file):
    data = pd.read_csv(file)
    return data

data = extract_data('Electric_Vehicle_Data.csv')

In [25]:
# extracting the columns
data.columns

Index(['VIN (1-10)', 'County', 'City', 'State', 'Postal Code', 'Model Year',
       'Make', 'Model', 'Electric Vehicle Type',
       'Clean Alternative Fuel Vehicle (CAFV) Eligibility', 'Electric Range',
       'Base MSRP', 'Legislative District', 'DOL Vehicle ID',
       'Vehicle Location', 'Electric Utility', '2020 Census Tract'],
      dtype='object')

In [26]:
# performing attribute-oriented induction using the following steps
# Input: (i) a relational database,(ii) a concept hierarchy table,(iii) the
# learning task, and (iv) the threshold value (T).
# • Output: A characteristic rule for the target class learned from the
# database
# • Step 1. Select the task-relevant data relation P.
# • Step 2. Perform attribute-oriented induction, which is described by the
# following procedure
# • Generalization is performed on each attribute of P.
# • Step 3: Simplify the generalized relation
# • If only one attribute of several tuples contains distinct values, the
# several tuples can be reduced into one by taking the distinct values
# of that attribute as a set.
# • Step 4: Transform the final relation into logic formulas
# • Step 5: Output the logic formulas as characteristic rules.


In [27]:
# step 1: Select the task-relevant data relation P.
# Make, Model, electric Vehicle Type,Electric Range,Base MSRP Model Year are what i want to select
data_reduced = data[['Make', 'Model', 'Electric Vehicle Type', 'Electric Range', 'Base MSRP', 'Model Year', 'State']]

In [28]:
data_reduced.head()

Unnamed: 0,Make,Model,Electric Vehicle Type,Electric Range,Base MSRP,Model Year,State
0,AUDI,A3,Plug-in Hybrid Electric Vehicle (PHEV),16,0,2017,WA
1,AUDI,A3,Plug-in Hybrid Electric Vehicle (PHEV),16,0,2018,WA
2,TESLA,MODEL S,Battery Electric Vehicle (BEV),210,0,2017,WA
3,JEEP,WRANGLER,Plug-in Hybrid Electric Vehicle (PHEV),25,0,2021,WA
4,TESLA,MODEL 3,Battery Electric Vehicle (BEV),308,0,2020,WA


In [29]:
def create_concept_hierarchy(data):
    concept_hierarchy = {}

    # Make
    makes = data['Make'].unique()
    concept_hierarchy['Make'] = {make: make for make in makes}
    # Electric Vehicle Type
    concept_hierarchy['Electric Vehicle Type'] = {
        'Battery Electric Vehicle (BEV)': 'FullElectric',
        'Plug-in Hybrid Electric Vehicle (PHEV)': 'HybridElectric'
    }

    # Electric Range
    concept_hierarchy['Electric Range'] = lambda x: 'LongRange' if x > 200 else ('MediumRange' if x > 100 else 'ShortRange')

    # Base MSRP
    concept_hierarchy['Base MSRP'] = lambda x: 'HighEnd' if x > 50000 else ('MidRange' if x > 30000 else 'Affordable')

    # Model Year
    concept_hierarchy['Model Year'] = lambda x: str(x // 10 * 10) + 's'  # Generalize to decades

    # State
    states = data['State'].unique()
    regions = {
        'West': ['WA', 'OR', 'CA', 'NV', 'ID', 'MT', 'WY', 'UT', 'CO', 'AZ', 'NM'],
        'Midwest': ['ND', 'SD', 'NE', 'KS', 'MN', 'IA', 'MO', 'WI', 'IL', 'IN', 'MI', 'OH'],
        'South': ['TX', 'OK', 'AR', 'LA', 'MS', 'AL', 'TN', 'KY', 'WV', 'VA', 'NC', 'SC', 'GA', 'FL'],
        'Northeast': ['ME', 'NH', 'VT', 'MA', 'RI', 'CT', 'NY', 'PA', 'NJ', 'DE', 'MD', 'DC']
    }
    concept_hierarchy['Region'] = {state: region for region, states in regions.items() for state in states}

    return concept_hierarchy

hierarchy = create_concept_hierarchy(data)


In [30]:
# Print the concept hierarchy
for attribute, mapping in hierarchy.items():
    print(f"\n{attribute}:")
    if callable(mapping):
        print(f"  Custom function: {mapping.__name__}")
    else:
        for key, value in list(mapping.items())[:5]:  # Print first 5 items as an example
            print(f"  {key}: {value}")
        if len(mapping) > 5:
            print(f"  ... and {len(mapping) - 5} more")


Make:
  AUDI: AUDI
  TESLA: TESLA
  JEEP: JEEP
  CHEVROLET: CHEVROLET
  KIA: KIA
  ... and 35 more

Electric Vehicle Type:
  Battery Electric Vehicle (BEV): FullElectric
  Plug-in Hybrid Electric Vehicle (PHEV): HybridElectric

Electric Range:
  Custom function: <lambda>

Base MSRP:
  Custom function: <lambda>

Model Year:
  Custom function: <lambda>

Region:
  WA: West
  OR: West
  CA: West
  NV: West
  ID: West
  ... and 44 more


In [31]:
from collections import Counter


def attribute_oriented_induction(data, concept_hierarchy):


    # Step 2: Perform attribute-oriented induction
    generalized_data = generalize_attributes(data, concept_hierarchy)

    # Step 3: Simplify the generalized relation
    simplified_data = simplify_relation(generalized_data)
    characteristic_rules = extract_rules(simplified_data)

    return characteristic_rules, simplified_data

def generalize_attributes(data, concept_hierarchy):
    generalized_data = data.copy()
    for column in data.columns:
        if column in concept_hierarchy:
            generalized_data[column] = data[column].map(concept_hierarchy[column])
    return generalized_data

def simplify_relation(data):
    # Group by all columns except the one with distinct values
    grouped = data.groupby(list(data.columns[:-1]))
    
    # Aggregate the last column as a set of distinct values
    simplified = grouped.agg({data.columns[-1]: lambda x: set(x)}).reset_index()
    
    return simplified

def extract_rules(data, min_support=0.1, min_confidence=0.7):
    total_records = len(data)
    rules = []

    for column in data.columns[:-1]:  # Exclude the last column (which contains sets of values)
        value_counts = Counter(data[column])
        
        for value, count in value_counts.items():
            support = count / total_records
            if support >= min_support:
                consequent_counts = Counter([item for items in data[data[column] == value][data.columns[-1]] for item in items])
                
                for consequent, consequent_count in consequent_counts.items():
                    confidence = consequent_count / count
                    if confidence >= min_confidence:
                        rule = f"If {column} is {value}, then {data.columns[-1]} is likely to be {consequent}"
                        rules.append((rule, support, confidence))

    return rules

characteristic_rules, simplified_data = attribute_oriented_induction(data_reduced, hierarchy)

print("Characteristic Rules:")
for rule in characteristic_rules:
    print(rule)

Characteristic Rules:
('If Make is BMW, then State is likely to be WA', 0.11627906976744186, 1.0)
('If Electric Vehicle Type is HybridElectric, then State is likely to be WA', 0.44651162790697674, 1.0)
('If Electric Vehicle Type is FullElectric, then State is likely to be WA', 0.5534883720930233, 1.0)
('If Electric Range is ShortRange, then State is likely to be WA', 0.8232558139534883, 1.0)
('If Electric Range is LongRange, then State is likely to be WA', 0.11162790697674418, 1.0)
('If Base MSRP is Affordable, then State is likely to be WA', 0.9069767441860465, 1.0)
('If Model Year is 2020s, then State is likely to be WA', 0.6139534883720931, 1.0)
('If Model Year is 2010s, then State is likely to be WA', 0.3627906976744186, 1.0)


In [32]:
simplified_data.head()

Unnamed: 0,Make,Model,Electric Vehicle Type,Electric Range,Base MSRP,Model Year,State
0,ALFA ROMEO,TONALE,HybridElectric,ShortRange,Affordable,2020s,{WA}
1,AUDI,A3,HybridElectric,ShortRange,Affordable,2010s,{WA}
2,AUDI,A7,HybridElectric,ShortRange,Affordable,2020s,{WA}
3,AUDI,A8 E,HybridElectric,ShortRange,Affordable,2020s,{WA}
4,AUDI,E-TRON,FullElectric,LongRange,Affordable,2010s,"{MD, WA}"


In [33]:
# grouping by make and counting number of electric vehicles
make_counts = simplified_data.groupby('Make').size()
# sorting the values
make_counts = make_counts.sort_values(ascending=False)
# printing top 10
make_counts.head(10)


Make
BMW              25
KIA              16
TESLA            16
AUDI             15
HYUNDAI          14
VOLVO            13
MERCEDES-BENZ    13
PORSCHE          10
FORD             10
CHEVROLET         9
dtype: int64


# Generalizing the rules
generalize_rules(rules)
print('Generalized rules:')
generate_rules(rules)

# Removing attributes
remove_attributes(rules, 'Vehicle')
print('Removed attributes:')
generate_rules(rules)

# Generating rules
print('Generated rules:')
generate_rules(rules)

# Evaluating the rules
print('Evaluated rules:')
evaluate_rules(rules)

# Extracting rules
rules = extract_rules(extract_data('Electric_Vehicle_Data.csv'))
print('Extracted rules:')
generate_rules(rules)
