# Stacking Repeating Data in a Dataframe

This notebook answers the following Stack Overflow question:
- https://stackoverflow.com/q/62843577/1609514

In [1]:
import numpy as np
import pandas as pd

In [2]:
# A small sample standing in for the data in the question:
# one row per (State, City) pair.
data = [
    ['NY', 'Albany'],
    ['NY', 'NYC'],
    ['MA', 'Boston'],
    ['MA', 'Cambridge'],
]
cities = pd.DataFrame(
    data,
    columns=['State', 'City'],
)
print(cities)

  State       City
0    NY     Albany
1    NY        NYC
2    MA     Boston
3    MA  Cambridge


In [3]:
# First, turn the (State, City) pairs into a multi-level index
# (this will end up in the final dataframe).
# The two columns are selected explicitly so that this cell gives the same
# result when re-run, even if other columns have been added to `cities`
# in the meantime.
cities_index = pd.MultiIndex.from_frame(cities[['State', 'City']])
print(cities_index)

MultiIndex([('NY',    'Albany'),
            ('NY',       'NYC'),
            ('MA',    'Boston'),
            ('MA', 'Cambridge')],
           names=['State', 'City'])


In [4]:
# Now build a dataframe holding every year for every city:
# one row per city (indexed by State/City), one column per year.
years = list(range(2000, 2003))
n_cities = len(cities)
# Tile the list of years once per city -> array of shape (n_cities, len(years)).
years_data = pd.DataFrame(np.tile(years, (n_cities, 1)), index=cities_index)
# Name the column axis; this becomes the name of the extra index level
# once the frame is stacked.
years_data.columns.name = 'Year index'
print(years_data)

Year index          0     1     2
State City                       
NY    Albany     2000  2001  2002
      NYC        2000  2001  2002
MA    Boston     2000  2001  2002
      Cambridge  2000  2001  2002


In [5]:
# Finally, stack the year columns down into rows. The result is a Series
# whose index has the levels State, City and 'Year index', and whose
# values are the years themselves.
years_by_city = years_data.stack()
years_by_city.name = 'Year'
print(years_by_city.iloc[:5])

State  City    Year index
NY     Albany  0             2000
               1             2001
               2             2002
       NYC     0             2000
               1             2001
Name: Year, dtype: int64


In [6]:
# Inspect the index of the stacked series: it has three levels
# (State, City, Year index), with one entry per city/year combination.
years_by_city.index

MultiIndex([('NY',    'Albany', 0),
            ('NY',    'Albany', 1),
            ('NY',    'Albany', 2),
            ('NY',       'NYC', 0),
            ('NY',       'NYC', 1),
            ('NY',       'NYC', 2),
            ('MA',    'Boston', 0),
            ('MA',    'Boston', 1),
            ('MA',    'Boston', 2),
            ('MA', 'Cambridge', 0),
            ('MA', 'Cambridge', 1),
            ('MA', 'Cambridge', 2)],
           names=['State', 'City', 'Year index'])

In [7]:
# Move every index level (State, City, Year index) out into ordinary
# columns, so that all the values end up in a flat dataframe next to 'Year'.
cities_and_years = years_by_city.reset_index(drop=False)
cities_and_years

Unnamed: 0,State,City,Year index,Year
0,NY,Albany,0,2000
1,NY,Albany,1,2001
2,NY,Albany,2,2002
3,NY,NYC,0,2000
4,NY,NYC,1,2001
5,NY,NYC,2,2002
6,MA,Boston,0,2000
7,MA,Boston,1,2001
8,MA,Boston,2,2002
9,MA,Cambridge,0,2000


## Much Simpler Alternative

In [8]:
# (DataFrame.explode is new in Pandas version 0.25.0)
# Attach the same list of years to every row, then explode that column so
# each (State, City) row is repeated once per year.
# assign() returns a new frame, so `cities` itself is left unchanged and the
# earlier cells still work if they are re-run after this one.
year_range = list(range(2000, 2019))  # 2000 through 2018 inclusive
years_by_city = (
    cities
    .assign(Year=[year_range] * len(cities))
    .explode('Year')
    # explode() leaves the exploded column with object dtype; restore integers.
    .astype({'Year': 'int64'})
)
years_by_city

Unnamed: 0,State,City,Year
0,NY,Albany,2000
0,NY,Albany,2001
0,NY,Albany,2002
0,NY,Albany,2003
0,NY,Albany,2004
...,...,...,...
3,MA,Cambridge,2014
3,MA,Cambridge,2015
3,MA,Cambridge,2016
3,MA,Cambridge,2017
