# Version 1

In [None]:
import os
import pandas as pd

# Directory where the per-year CSV files (<year>.csv) are stored
csv_directory = r'us-car-models-data-master'

# Makes to keep, written in lowercase so they can be matched against the
# lowercased 'make' column
selected_makes = ['toyota', 'hyundai', 'kia', 'chevrolet', 'changan',
                  'nissan', 'suzuki', 'dfsk', 'jac', 'volkswagen',
                  'mitsubishi', 'ford', 'chery', 'mazda', 'renault',
                  'honda', 'great wall', 'subaru', 'mg', 'foton']


def load_filtered_models(directory, makes, years):
    """Load the per-year CSV files and keep only the rows of the given makes.

    For every year in *years* the file ``<directory>/<year>.csv`` is read if
    it exists; missing files are skipped. The 'make' column is lowercased
    before it is compared against *makes*, and it stays lowercased in the
    returned data.

    Args:
        directory: Folder containing the ``<year>.csv`` files.
        makes: Lowercase make names to keep.
        years: Iterable of years to look for.

    Returns:
        A DataFrame with the matching rows of all files found, indexed
        0..n-1. An empty DataFrame when no file exists.

    Raises:
        KeyError: If a file that exists has no 'make' column.
    """
    frames = []
    for year in years:
        csv_file = os.path.join(directory, f'{year}.csv')
        if not os.path.exists(csv_file):
            continue

        df = pd.read_csv(csv_file)
        df['make'] = df['make'].str.lower()
        frames.append(df[df['make'].isin(makes)])

    if not frames:
        return pd.DataFrame()

    # Concatenate once: calling pd.concat on the accumulator inside the loop
    # would re-copy every previously collected row on each iteration.
    return pd.concat(frames, ignore_index=True)


# Collect the selected makes from every available year
filtered_data = load_filtered_models(csv_directory, selected_makes,
                                     range(1992, 2025))

# Save the filtered data to a new CSV file
filtered_data.to_csv('new_models.csv', index=False)


# Version 2

In [2]:
import os
import pandas as pd

# Directory holding the per-year CSV files (<year>.csv)
csv_directory = r'us-car-models-data-master'

# Selected makes, in lowercase so they can be matched against the
# lowercased 'make' column
selected_makes = [
    'toyota', 'hyundai', 'kia', 'chevrolet', 'changan',
    'nissan', 'suzuki', 'dfsk', 'jac', 'volkswagen',
    'mitsubishi', 'ford', 'chery', 'mazda', 'renault',
    'honda', 'great wall', 'subaru', 'mg', 'foton'
]


def load_filtered_models(directory, makes, years):
    """Load the per-year CSV files and keep only the rows of the given makes.

    For every year in *years* the file ``<directory>/<year>.csv`` is read if
    it exists; missing files are skipped. The 'make' column is lowercased
    before it is compared against *makes*, and each kept row gets the file's
    year in a 'year' column (overwriting any 'year' column already present).

    Args:
        directory: Folder containing the ``<year>.csv`` files.
        makes: Lowercase make names to keep.
        years: Iterable of years to look for.

    Returns:
        A DataFrame with the matching rows of all files found, indexed
        0..n-1. An empty DataFrame when no file exists.

    Raises:
        KeyError: If a file that exists has no 'make' column.
    """
    frames = []
    for year in years:
        csv_file = os.path.join(directory, f'{year}.csv')
        if not os.path.exists(csv_file):
            continue

        df = pd.read_csv(csv_file)
        df['make'] = df['make'].str.lower()
        # Copy so that adding 'year' does not write into a slice of df
        matched = df[df['make'].isin(makes)].copy()
        matched['year'] = year
        frames.append(matched)

    if not frames:
        return pd.DataFrame()

    # Concatenate once: calling pd.concat on the accumulator inside the loop
    # would re-copy every previously collected row on each iteration.
    return pd.concat(frames, ignore_index=True)


def summarize_makes(data):
    """Build a one-row-per-make summary of *data*.

    Args:
        data: DataFrame with 'make', 'model' and 'year' columns, or an empty
            DataFrame.

    Returns:
        A DataFrame with the columns 'marca' (the make), 'cantidad_modelos'
        (number of distinct models) and 'años' (the distinct years, sorted
        and joined with commas). It has no rows when *data* is empty.
    """
    if data.empty:
        # Without any loaded file there is no 'make' column to group by;
        # return an empty summary with the expected columns instead of
        # letting groupby raise KeyError.
        return pd.DataFrame(columns=['marca', 'cantidad_modelos', 'años'])

    return (
        data
        .groupby('make')
        .agg(
            cantidad_modelos=('model', 'nunique'),
            años=('year', lambda x: ','.join(map(str, sorted(set(x)))))
        )
        .reset_index()
        .rename(columns={'make': 'marca'})
    )


# Read and filter the files year by year
filtered_data = load_filtered_models(csv_directory, selected_makes,
                                     range(1992, 2025))

# Save the filtered data as CSV
filtered_data.to_csv('new_models.csv', index=False)

# Build the per-make summary
summary = summarize_makes(filtered_data)

# Save the summary to an Excel file
summary.to_excel('resumen_marcas.xlsx', index=False)
