In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

# State Level 2011 - 2022 GT MVT, Annually

In [2]:
directory = "C:/Users/tosea/GT-MVT-Annually-States"

# List all .csv files in the directory
csv_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')]

# Initialize an empty list to store data frames
dfs = []

# Read each .csv file into separate data frames and store them in the list
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Combine all data frames into a single data frame
combined_df = pd.concat(dfs, ignore_index=True)
combined_df = combined_df.rename(columns={"Unnamed: 0": "State"})
# Set "DMA" as the index
combined_df = combined_df.set_index("State")
gt_state_annual_mean_df = combined_df.groupby(level=0).mean().round(2)



In [3]:
gt_state_annual_mean_df = gt_state_annual_mean_df.stack().reset_index()

In [4]:
gt_state_annual_mean_df.columns = ["State", "Year", "GT MVT"]

In [5]:
gt_state_annual_mean_df = gt_state_annual_mean_df[["Year", "State",  "GT MVT"]]

In [6]:
gt_state_annual_mean_df = gt_state_annual_mean_df.sort_values(["Year", "State"])

In [7]:
gt_state_annual_mean_df.Year = gt_state_annual_mean_df.Year.str.replace("MVT_GT_", "")

In [8]:
gt_state_annual_mean_df.to_csv("gt_state_mvt_2011_2022.csv", index = False)

In [9]:
gt_state_annual_mean_df.head()

Unnamed: 0,Year,State,GT MVT
0,2011,Alabama,61.05
12,2011,Alaska,34.0
24,2011,Arizona,74.74
36,2011,Arkansas,40.5
48,2011,California,87.83


# DMA Level 2011-2022 GT MVT, Annually

In [10]:
directory = "C:/Users/tosea/GT-MVT-Annually-DMA"

# List all .csv files in the directory
csv_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')]

# Initialize an empty list to store data frames
dfs = []

# Read each .csv file into separate data frames and store them in the list
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Combine all data frames into a single data frame
combined_df = pd.concat(dfs, ignore_index=True)
combined_df = combined_df.rename(columns={"Unnamed: 0": "DMA"})
# Set "DMA" as the index
combined_df = combined_df.set_index("DMA")
gt_dma_annual_df = combined_df.groupby(level=0).mean().round(2)

In [11]:
gt_dma_annual_df = gt_dma_annual_df.stack().reset_index()

In [12]:
gt_dma_annual_df.columns = ["DMA", "Year", "GT MVT"]

In [13]:
gt_dma_annual_df = gt_dma_annual_df[["Year", "DMA",  "GT MVT"]]

In [14]:
gt_dma_annual_df.Year = gt_dma_annual_df.Year.str.replace("MVT_GT_", "")

In [15]:
gt_dma_annual_df = gt_dma_annual_df.sort_values(["Year", "DMA"])

In [16]:
gt_dma_annual_df.to_csv("gt_dma_mvt_annually_2011_2022.csv", index = False)

In [17]:
gt_dma_annual_df.head()

Unnamed: 0,Year,DMA,GT MVT
0,2011,Abilene-Sweetwater TX,56.97
12,2011,Albany GA,100.0
18,2011,Albany-Schenectady-Troy NY,23.03
30,2011,Albuquerque-Santa Fe NM,35.56
43,2011,Amarillo TX,48.56


# DMA Level 2017-2022 GT MVT, Monthly

In [18]:
directory = "C:/Users/tosea/GT-MVT-Monthly-City/csv_files"

# List all .csv files in the directory
csv_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')]

# Initialize an empty list to store data frames
dfs = []

# Read each .csv file into separate data frames and store them in the list
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Combine all data frames into a single data frame
combined_df = pd.concat(dfs, ignore_index=True)
combined_df = combined_df.rename(columns={"Unnamed: 0": "year_month"})
# Set "DMA" as the index
combined_df = combined_df.set_index("year_month")
gt_dma_monthly_df = combined_df.groupby(level=0).mean().round(2)

In [19]:
gt_dma_monthly_df = gt_dma_monthly_df.reset_index()

In [20]:
gt_dma_monthly_df

Unnamed: 0,year_month,MVT_GT_Albuquerque-Santa Fe NM,MVT_GT_Atlanta GA,MVT_GT_Austin TX,MVT_GT_Bakersfield CA,MVT_GT_Baltimore MD,MVT_GT_Boston MA-Manchester NH,MVT_GT_Charlotte NC,MVT_GT_Chicago IL,MVT_GT_Cincinnati OH,...,MVT_GT_Raleigh-Durham (Fayetteville) NC,MVT_GT_Sacramento-Stockton-Modesto CA,MVT_GT_San Antonio TX,MVT_GT_San Diego CA,MVT_GT_San Francisco-Oakland-San Jose CA,MVT_GT_Seattle-Tacoma WA,MVT_GT_Tucson AZ,MVT_GT_Tulsa OK,MVT_GT_Washington DC (Hagerstown MD),MVT_GT_Wichita-Hutchinson KS
0,01/01/2015 00:00:00,,,,,,,,,49.33,...,,,,,,,,,,
1,01/01/2016 00:00:00,,,,,,,,,44.31,...,,,,,,,,,,
2,01/01/2017 00:00:00,,,,,,,,,46.59,...,,,,,,,,,,
3,01/01/2018 00:00:00,,,,,,,,,40.40,...,,,,,,,,,,
4,01/01/2019 00:00:00,,,,,,,,,36.47,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,2022-08-01 00:00:00,44.85,53.79,72.72,33.30,41.61,63.76,33.14,97.46,60.00,...,80.40,61.03,70.35,43.00,80.91,70.16,35.96,29.37,62.31,31.99
188,2022-09-01 00:00:00,37.12,75.69,49.66,25.33,52.78,43.04,72.20,72.33,82.00,...,39.73,57.62,41.15,60.67,73.77,78.09,36.64,37.64,63.74,31.96
189,2022-10-01 00:00:00,39.68,62.46,60.11,26.65,44.90,53.08,53.24,92.58,77.75,...,34.65,59.59,63.65,51.12,74.87,86.28,31.10,26.30,53.74,29.68
190,2022-11-01 00:00:00,30.24,70.20,52.00,23.78,56.61,39.05,61.34,71.47,,...,49.29,61.32,40.49,49.27,80.75,78.43,35.76,18.44,58.19,24.94


In [21]:
gt_dma_monthly_df.year_month = pd.to_datetime(gt_dma_monthly_df.year_month).dt.strftime("%Y-%m")

In [22]:
gt_dma_monthly_df.set_index('year_month', inplace=True)

In [23]:
gt_dma_monthly_df = gt_dma_monthly_df.stack().reset_index()

In [24]:
gt_dma_monthly_df.columns = ["year_month", "DMA", "GT MVT"]

In [25]:
gt_dma_monthly_df.DMA = gt_dma_monthly_df.DMA.str.replace("MVT_GT_", "")

In [26]:
gt_dma_monthly_df = gt_dma_monthly_df.sort_values(["year_month", "DMA"])

In [27]:
gt_dma_monthly_filtered = gt_dma_monthly_df[gt_dma_monthly_df.year_month >= "2017"]

In [28]:
# Rescale the 'mvt_data' column from 0 to 100
max_value = gt_dma_monthly_filtered['GT MVT'].max()
gt_dma_monthly_filtered['GT MVT'] = ((gt_dma_monthly_filtered['GT MVT']) / (max_value)) * 100

# Optionally, you can round the scaled values to maintain consistency
gt_dma_monthly_filtered['GT MVT'] = gt_dma_monthly_filtered['GT MVT'].round(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gt_dma_monthly_filtered['GT MVT'] = ((gt_dma_monthly_filtered['GT MVT']) / (max_value)) * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gt_dma_monthly_filtered['GT MVT'] = gt_dma_monthly_filtered['GT MVT'].round(1)


In [29]:
gt_dma_monthly_filtered.to_csv("gt_dma_mvt_monthly_2017_2022.csv", index = False)

# Visualization

In [30]:
def visualization(df, date_column, geo_column, value_column):
    df = df.set_index([date_column])
    import matplotlib.pyplot as plt
    for geo_unit in df[geo_column].unique():
    # 3. Filter the data for each DMA if needed
    # Example: Filter data for a specific DMA
        plot_data = df[df[geo_column] == geo_unit]

        # 4. Plot the movement data over time for each DMA
        plt.figure(figsize=(10, 6))
        plot_data[value_column].plot(marker='o', linestyle='-')
        plt.title(f'Movement Data Over Time for {geo_column}: {geo_unit}')
        plt.xlabel(date_column)
        plt.ylabel(f'{value_column}')
        plt.grid(True)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    

In [31]:
#visualization(gt_dma_monthly_filtered, 'year_month', 'DMA', 'GT MVT')

In [32]:
#visualization(gt_dma_annual_df, 'Year', 'DMA', 'GT MVT')

In [33]:
#visualization(gt_state_annual_mean_df, 'Year', 'State', 'GT MVT')