In [31]:
import pandas as pd

# Load CSV
data = pd.read_csv('weatherHistory.csv')

# Ensure 'Formatted Date' column is of string type and drop NaNs
data = data.dropna(subset=['Formatted Date'])  # Remove NaN rows
data['Formatted Date'] = data['Formatted Date'].astype(str)

# Splitting Date, Time, and Timezone
split_data = data['Formatted Date'].str.split(' ', expand=True)

# Handling cases where Timezone is missing (optional)
if split_data.shape[1] == 3:  # Expected format (Date, Time.Milliseconds, Timezone)
    data[['date', 'time_ms', 'timezone']] = split_data
elif split_data.shape[1] == 2:  # If timezone is missing
    data[['date', 'time_ms']] = split_data
    data['timezone'] = None  # Assign NaN to missing timezones

# Splitting Time and Milliseconds
data[['time', 'milliseconds']] = data['time_ms'].str.split('.', expand=True)

# Dropping unnecessary columns
data = data.drop(columns=['Formatted Date', 'time_ms'])

# Display result
print(data.head())


         Summary Precip Type  Temperature (C)  Apparent Temperature (C)  \
0  Partly Cloudy        rain         9.472222                  7.388889   
1  Partly Cloudy        rain         9.355556                  7.227778   
2  Mostly Cloudy        rain         9.377778                  9.377778   
3  Partly Cloudy        rain         8.288889                  5.944444   
4  Mostly Cloudy        rain         8.755556                  6.977778   

   Humidity  Wind Speed (km/h)  Wind Bearing (degrees)  Visibility (km)  \
0      0.89            14.1197                   251.0          15.8263   
1      0.86            14.2646                   259.0          15.8263   
2      0.89             3.9284                   204.0          14.9569   
3      0.83            14.1036                   269.0          15.8263   
4      0.83            11.0446                   259.0          15.8263   

   Loud Cover  Pressure (millibars)                      Daily Summary  \
0         0.0           

In [28]:


print(data['Precip Type'].nunique())
print(data['Summary'].nunique())
print(data['Daily Summary'].nunique())
print(data['timezone'].nunique())
print(data['time'].nunique())
print(data['milliseconds'].nunique())


2
27
214
2
24
1


In [22]:
data.head()

Unnamed: 0,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary,date,timezone,time,milliseconds
0,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.,2006-04-01,200,00:00:00,0
1,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.,2006-04-01,200,01:00:00,0
2,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.,2006-04-01,200,02:00:00,0
3,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.,2006-04-01,200,03:00:00,0
4,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.,2006-04-01,200,04:00:00,0


In [23]:
data.shape

(96453, 15)

In [24]:
d=pd.read_csv('weatherHistory.csv')

In [26]:
d.shape

(96453, 12)

In [27]:
data['Precip Type'].nunique()

2

In [29]:
dd=data.drop(columns=['milliseconds'])
dd.columns

Index(['Summary', 'Precip Type', 'Temperature (C)', 'Apparent Temperature (C)',
       'Humidity', 'Wind Speed (km/h)', 'Wind Bearing (degrees)',
       'Visibility (km)', 'Loud Cover', 'Pressure (millibars)',
       'Daily Summary', 'date', 'timezone', 'time'],
      dtype='object')

In [36]:

from ctgan import CTGAN  # Corrected import

from sklearn.preprocessing import LabelEncoder


In [32]:
columns_to_use = ['Summary', 'Precip Type', 'Temperature (C)', 'Apparent Temperature (C)',
                  'Humidity', 'Wind Speed (km/h)', 'Wind Bearing (degrees)',
                  'Visibility (km)', 'Loud Cover', 'Pressure (millibars)', 'Daily Summary']

df = data[columns_to_use]

In [37]:
ctgan = CTGAN(epochs=100, batch_size=500)

In [38]:
categorical_columns = ['Summary', 'Precip Type', 'Daily Summary']
for col in categorical_columns:
    df[col] = df[col].astype(str)  # Ensure all are strings
    df[col] = LabelEncoder().fit_transform(df[col])  # Convert to numbers

# Display data info
print(df.head())

   Summary  Precip Type  Temperature (C)  Apparent Temperature (C)  Humidity  \
0       19            1         9.472222                  7.388889      0.89   
1       19            1         9.355556                  7.227778      0.86   
2       17            1         9.377778                  9.377778      0.89   
3       19            1         8.288889                  5.944444      0.83   
4       17            1         8.755556                  6.977778      0.83   

   Wind Speed (km/h)  Wind Bearing (degrees)  Visibility (km)  Loud Cover  \
0            14.1197                   251.0          15.8263         0.0   
1            14.2646                   259.0          15.8263         0.0   
2             3.9284                   204.0          14.9569         0.0   
3            14.1036                   269.0          15.8263         0.0   
4            11.0446                   259.0          15.8263         0.0   

   Pressure (millibars)  Daily Summary  
0              

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str)  # Ensure all are strings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = LabelEncoder().fit_transform(df[col])  # Convert to numbers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str)  # Ensure all are strings
A value is trying to be

In [39]:
# Fit model to the dataset
ctgan.fit(df, categorical_columns)


KeyboardInterrupt: 