# Data Analytics I: Assignment 2

In [11]:
import numpy as np
import pandas as pd
import csv
import os
import sys


In [12]:
# extracting data from the csv
def extract_data(file):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file : str, path-like or file-like
        Passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, read with ``read_csv`` defaults.
    """
    return pd.read_csv(file)

# Load the Electric Vehicle dataset that the following cells work on.
data = extract_data(file='Electric_Vehicle_Data.csv')

In [13]:
# Display the column names of the loaded dataset
data.columns

Index(['VIN (1-10)', 'County', 'City', 'State', 'Postal Code', 'Model Year',
       'Make', 'Model', 'Electric Vehicle Type',
       'Clean Alternative Fuel Vehicle (CAFV) Eligibility', 'Electric Range',
       'Base MSRP', 'Legislative District', 'DOL Vehicle ID',
       'Vehicle Location', 'Electric Utility', '2020 Census Tract'],
      dtype='object')

In [14]:
# Attribute-oriented induction (AOI): outline of the procedure followed below.
# Input: (i) a relational database, (ii) a concept hierarchy table, (iii) the
#        learning task, and (iv) the threshold value (T).
# Output: a characteristic rule for the target class learned from the
#         database.
# • Step 1: Select the task-relevant data relation P.
# • Step 2: Perform attribute-oriented induction, which is described by the
#   following procedure:
#     - Generalization is performed on each attribute of P.
# • Step 3: Simplify the generalized relation:
#     - If only one attribute of several tuples contains distinct values, the
#       several tuples can be reduced into one by taking the distinct values
#       of that attribute as a set.
# • Step 4: Transform the final relation into logic formulas.
# • Step 5: Output the logic formulas as characteristic rules.


In [15]:
# Step 1: select the task-relevant data relation P.
# The attributes kept for induction are Make, Model, Electric Vehicle Type,
# Electric Range, Base MSRP, Model Year and State.
data_reduced = data.loc[:, [
    'Make',
    'Model',
    'Electric Vehicle Type',
    'Electric Range',
    'Base MSRP',
    'Model Year',
    'State',
]]

In [16]:
# Preview the first rows of the task-relevant relation
data_reduced.head()

Unnamed: 0,Make,Model,Electric Vehicle Type,Electric Range,Base MSRP,Model Year,State
0,AUDI,A3,Plug-in Hybrid Electric Vehicle (PHEV),16,0,2017,WA
1,AUDI,A3,Plug-in Hybrid Electric Vehicle (PHEV),16,0,2018,WA
2,TESLA,MODEL S,Battery Electric Vehicle (BEV),210,0,2017,WA
3,JEEP,WRANGLER,Plug-in Hybrid Electric Vehicle (PHEV),25,0,2021,WA
4,TESLA,MODEL 3,Battery Electric Vehicle (BEV),308,0,2020,WA


In [17]:
def create_concept_hierarchy(data):
    """Build the concept hierarchy used to generalize each attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the 'Make' and 'State' columns. Their distinct values
        decide which makes fall into 'OtherEV' and which states fall into
        the 'Other' region.

    Returns
    -------
    dict
        Maps an attribute name to either a dict (raw value -> concept) or a
        one-argument callable that turns a raw value into a concept label.
    """
    concept_hierarchy = {}

    # Make: two hand-picked groups; every other make found in the data is
    # assigned to 'OtherEV'.
    make_categories = {
        'LuxuryEV': ['TESLA', 'BMW', 'AUDI', 'JAGUAR', 'PORSCHE'],
        'MassMarketEV': ['NISSAN', 'CHEVROLET', 'FORD', 'HYUNDAI', 'KIA'],
    }
    # Derive the known makes from the table itself so the exclusion list
    # cannot drift out of sync with the category lists.
    known_makes = {make for members in make_categories.values() for make in members}
    make_categories['OtherEV'] = list(set(data['Make'].unique()) - known_makes)
    concept_hierarchy['Make'] = {
        make: category
        for category, members in make_categories.items()
        for make in members
    }

    # Electric Vehicle Type
    concept_hierarchy['Electric Vehicle Type'] = {
        'Battery Electric Vehicle (BEV)': 'FullElectric',
        'Plug-in Hybrid Electric Vehicle (PHEV)': 'HybridElectric'
    }

    # The numeric attributes stay lambdas on purpose: the cell that prints the
    # hierarchy shows their __name__ ('<lambda>').

    # Electric Range: > 200 long, > 100 medium, otherwise short.
    concept_hierarchy['Electric Range'] = lambda x: 'LongRange' if x > 200 else ('MediumRange' if x > 100 else 'ShortRange')

    # Base MSRP: > 50000 high end, > 30000 mid range, otherwise affordable.
    # NOTE(review): the data preview shows Base MSRP values of 0; if 0 encodes
    # "unknown", those rows are labelled 'Affordable' - confirm.
    concept_hierarchy['Base MSRP'] = lambda x: 'HighEnd' if x > 50000 else ('MidRange' if x > 30000 else 'Affordable')

    # Model Year
    concept_hierarchy['Model Year'] = lambda x: str(x // 10 * 10) + 's'  # Generalize to decades

    # State: fixed region table, plus an 'Other' fallback for any state code
    # that occurs in the data but is not listed, so that it does not turn into
    # NaN when the hierarchy is applied.
    regions = {
        'West': ['WA', 'OR', 'CA', 'NV', 'ID', 'MT', 'WY', 'UT', 'CO', 'AZ', 'NM'],
        'Midwest': ['ND', 'SD', 'NE', 'KS', 'MN', 'IA', 'MO', 'WI', 'IL', 'IN', 'MI', 'OH'],
        'South': ['TX', 'OK', 'AR', 'LA', 'MS', 'AL', 'TN', 'KY', 'WV', 'VA', 'NC', 'SC', 'GA', 'FL'],
        'Northeast': ['ME', 'NH', 'VT', 'MA', 'RI', 'CT', 'NY', 'PA', 'NJ', 'DE', 'MD', 'DC']
    }
    state_to_region = {
        state: region
        for region, members in regions.items()
        for state in members
    }
    for state in data['State'].dropna().unique():
        state_to_region.setdefault(state, 'Other')
    concept_hierarchy['State'] = state_to_region

    return concept_hierarchy

# Build the hierarchy from the full dataset loaded above.
hierarchy = create_concept_hierarchy(data=data)


In [18]:
# Print the concept hierarchy: callables are shown by name, dict mappings by
# their first five entries followed by a count of the remainder.
for attribute, mapping in hierarchy.items():
    print(f"\n{attribute}:")
    if callable(mapping):
        print(f"  Custom function: {mapping.__name__}")
        continue

    preview = list(mapping.items())[:5]
    for key, value in preview:
        print(f"  {key}: {value}")

    remaining = len(mapping) - 5
    if remaining > 0:
        print(f"  ... and {remaining} more")


Make:
  TESLA: LuxuryEV
  BMW: LuxuryEV
  AUDI: LuxuryEV
  JAGUAR: LuxuryEV
  PORSCHE: LuxuryEV
  ... and 35 more

Electric Vehicle Type:
  Battery Electric Vehicle (BEV): FullElectric
  Plug-in Hybrid Electric Vehicle (PHEV): HybridElectric

Electric Range:
  Custom function: <lambda>

Base MSRP:
  Custom function: <lambda>

Model Year:
  Custom function: <lambda>

State:
  WA: West
  OR: West
  CA: West
  NV: West
  ID: West
  ... and 44 more


In [19]:
def attribute_oriented_induction(data, concept_hierarchy, learning_task, threshold):
    """Run the five attribute-oriented induction steps on `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Make', 'Model', 'Electric Vehicle Type',
        'Electric Range', 'Base MSRP', 'Model Year' and 'State'.
    concept_hierarchy : dict
        Attribute name -> dict or callable, applied by `generalize_attributes`.
    learning_task, threshold
        Forwarded unchanged to `generate_characteristic_rules`.

    Returns
    -------
    list of str
        The characteristic rules produced by `generate_characteristic_rules`.
    """
    # Step 1: select the task-relevant data relation P.
    task_relevant = data[[
        'Make', 'Model', 'Electric Vehicle Type', 'Electric Range',
        'Base MSRP', 'Model Year', 'State',
    ]]

    # Steps 2-4: generalize the attributes, simplify the generalized relation,
    # then turn the result into logic formulas.
    formulas = transform_to_logic(
        simplify_relation(
            generalize_attributes(task_relevant, concept_hierarchy)
        )
    )

    # Step 5: output the logic formulas as characteristic rules.
    return generate_characteristic_rules(formulas, learning_task, threshold)

def generalize_attributes(data, concept_hierarchy):
    """Replace raw attribute values with their generalized concepts.

    Every column of `data` that has an entry in `concept_hierarchy` is passed
    through ``Series.map`` with that entry (a dict or a callable); all other
    columns are kept as they are. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `data` with the mapped columns replaced.
    """
    result = data.copy()
    columns_to_map = [name for name in data.columns if name in concept_hierarchy]
    for name in columns_to_map:
        result[name] = data[name].map(concept_hierarchy[name])
    return result

def simplify_relation(data):
    """Merge tuples that differ only in the last attribute.

    Rows are grouped by every column except the last one, and the last
    column of each group is replaced by the set of its distinct values.

    Parameters
    ----------
    data : pandas.DataFrame
        The generalized relation; needs at least two columns.

    Returns
    -------
    pandas.DataFrame
        One row per distinct combination of the grouping columns, with the
        last column holding a ``set``.
    """
    key_columns = list(data.columns[:-1])
    merged_column = data.columns[-1]

    # dropna=False: groupby otherwise discards every row whose grouping
    # columns contain a missing value, silently shrinking the relation.
    grouped = data.groupby(key_columns, dropna=False)

    # Aggregate the last column as a set of distinct values
    simplified = grouped.agg({merged_column: lambda values: set(values)}).reset_index()

    return simplified

def transform_to_logic(data):
    """Render each row of `data` as a conjunction of ``column(value)`` terms.

    Returns
    -------
    list of str
        One formula per row, in row order, with the terms joined by " ∧ ".
    """
    def _as_conjunction(row):
        # One predicate per column, in column order.
        predicates = []
        for attribute, value in row.items():
            predicates.append(f"{attribute}({value})")
        return " ∧ ".join(predicates)

    return [_as_conjunction(row) for _, row in data.iterrows()]

def generate_characteristic_rules(logic_formulas, learning_task, threshold):
    """Keep the formulas that `evaluate_rule` accepts.

    Each formula is checked with ``evaluate_rule(formula, learning_task,
    threshold)``; the accepted formulas are returned as a list in their
    original order.
    """
    def _is_accepted(formula):
        return evaluate_rule(formula, learning_task, threshold)

    return list(filter(_is_accepted, logic_formulas))

def evaluate_rule(rule, learning_task, threshold):
    """Decide whether `rule` is kept as a characteristic rule.

    Placeholder implementation: all three arguments are ignored and every
    rule is accepted. Real evaluation logic still has to be written here.
    """
    return True

# Example usage: run the whole induction pipeline on the dataset.
data = extract_data('Electric_Vehicle_Data.csv')

# The run below uses `hierarchy`, built earlier by create_concept_hierarchy.
# A smaller hand-written hierarchy is kept here for reference only
# (expand it based on your domain knowledge if you use it instead):
# concept_hierarchy = {
#     'Make': {'TESLA': 'LuxuryEV', 'NISSAN': 'MassMarketEV'},
#     'Electric Vehicle Type': {
#         'Battery Electric Vehicle (BEV)': 'FullElectric',
#         'Plug-in Hybrid Electric Vehicle (PHEV)': 'HybridElectric'
#     },
#     'Electric Range': lambda x: 'LongRange' if x > 200 else 'ShortRange',
#     'Base MSRP': lambda x: 'HighEnd' if x > 50000 else 'Affordable',
#     'Model Year': lambda x: str(x // 10 * 10) + 's'  # Generalize to decades
# }

learning_task = "Characterize electric vehicles"
threshold = 0.1

characteristic_rules = attribute_oriented_induction(
    data=data,
    concept_hierarchy=hierarchy,
    learning_task=learning_task,
    threshold=threshold,
)

print("Characteristic Rules:")
for rule in characteristic_rules:
    print(rule)

Characteristic Rules:
Make(LuxuryEV) ∧ Model(330E) ∧ Electric Vehicle Type(HybridElectric) ∧ Electric Range(ShortRange) ∧ Base MSRP(Affordable) ∧ Model Year(2020s) ∧ State({'West'})
Make(LuxuryEV) ∧ Model(330E) ∧ Electric Vehicle Type(HybridElectric) ∧ Electric Range(ShortRange) ∧ Base MSRP(MidRange) ∧ Model Year(2010s) ∧ State({'West'})
Make(LuxuryEV) ∧ Model(530E) ∧ Electric Vehicle Type(HybridElectric) ∧ Electric Range(ShortRange) ∧ Base MSRP(Affordable) ∧ Model Year(2020s) ∧ State({'West'})
Make(LuxuryEV) ∧ Model(530E) ∧ Electric Vehicle Type(HybridElectric) ∧ Electric Range(ShortRange) ∧ Base MSRP(HighEnd) ∧ Model Year(2010s) ∧ State({'West'})
Make(LuxuryEV) ∧ Model(740E) ∧ Electric Vehicle Type(HybridElectric) ∧ Electric Range(ShortRange) ∧ Base MSRP(HighEnd) ∧ Model Year(2010s) ∧ State({'West'})
Make(LuxuryEV) ∧ Model(745E) ∧ Electric Vehicle Type(HybridElectric) ∧ Electric Range(ShortRange) ∧ Base MSRP(Affordable) ∧ Model Year(2020s) ∧ State({'West'})
Make(LuxuryEV) ∧ Model(745

In [20]:
# NOTE(review): this cell does not look runnable on a fresh kernel - confirm.
#   * generalize_rules, generate_rules, remove_attributes, evaluate_rules and
#     extract_rules are not defined in any cell visible above.
#   * `rules` is read by the first call below but is only assigned by the
#     last statement of the cell.
# The commented-out lines are an earlier, manual exploration of the distinct
# values of each attribute.

# # step 2: Perform attribute-oriented induction, which is described by the following procedure
# # Generalization is performed on each attribute of P.

# # Generalization of Make
# make = data_reduced['Make'].unique()
# make

# # Generalization of Model
# model = data_reduced['Model'].unique()
# model

# # Generalization of Electric Vehicle Type
# electric_vehicle_type = data_reduced['Electric Vehicle Type'].unique()
# electric_vehicle_type

# # Generalization of Electric Range
# electric_range = data_reduced['Electric Range'].unique()
# electric_range




# Generalizing the rules
# NOTE(review): `rules` has no earlier assignment in the visible notebook.
generalize_rules(rules)
print('Generalized rules:')
generate_rules(rules)

# Removing attributes
remove_attributes(rules, 'Vehicle')
print('Removed attributes:')
generate_rules(rules)

# Generating rules
print('Generated rules:')
generate_rules(rules)

# Evaluating the rules
print('Evaluated rules:')
evaluate_rules(rules)

# Extracting rules
# NOTE(review): presumably this extraction was meant to run before the calls
# above, since it is the only place `rules` is assigned - confirm.
rules = extract_rules(extract_data('Electric_Vehicle_Data.csv'))
print('Extracted rules:')
generate_rules(rules)
