In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Step 1: Load dataset
data = pd.read_csv("/content/weather_data.csv")

print("Data loaded successfully!")
print("Shape:", data.shape)
print("Columns:", data.columns.tolist())
print(data.head())

Data loaded successfully!
Shape: (1462, 5)
Columns: ['date', 'meantemp', 'humidity', 'wind_speed', 'meanpressure']
         date   meantemp   humidity  wind_speed  meanpressure
0  2013-01-01  10.000000  84.500000    0.000000   1015.666667
1  2013-01-02   7.400000  92.000000    2.980000   1017.800000
2  2013-01-03   7.166667  87.000000    4.633333   1018.666667
3  2013-01-04   8.666667  71.333333    1.233333   1017.166667
4  2013-01-05   6.000000  86.833333    3.700000   1016.500000


In [3]:
# Step 2: Basic preprocessing
# Identify numerical and categorical columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

print("\nNumerical Columns:", numerical_cols.tolist())
print("Categorical Columns:", categorical_cols.tolist())


Numerical Columns: ['meantemp', 'humidity', 'wind_speed', 'meanpressure']
Categorical Columns: ['date']


In [4]:
# Step 3: Define transformers

# For numerical columns: handle missing values + scale
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# For categorical columns: handle missing + one-hot encode
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

In [5]:
# Step 4: Combine transformations
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

In [6]:
# Step 5: Create preprocessing pipeline
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor)
])

In [7]:
# Step 6: Apply the pipeline to data
processed_data = pipeline.fit_transform(data)

# Convert processed data to DataFrame for easier use
processed_df = pd.DataFrame(
    processed_data.toarray() if hasattr(processed_data, "toarray") else processed_data
)
print("\nData transformed successfully!")
print("Processed shape:", processed_df.shape)
print(processed_df.head())


Data transformed successfully!
Processed shape: (1462, 1466)
       0         1         2         3     4     5     6     7     8     9     \
0 -2.109500  1.415439 -1.491699  0.025321   1.0   0.0   0.0   0.0   0.0   0.0   
1 -2.463454  1.862828 -0.838196  0.037162   0.0   1.0   0.0   0.0   0.0   0.0   
2 -2.495219  1.564569 -0.475626  0.041972   0.0   0.0   1.0   0.0   0.0   0.0   
3 -2.291015  0.630022 -1.221233  0.033647   0.0   0.0   0.0   1.0   0.0   0.0   
4 -2.654044  1.554627 -0.680303  0.029946   0.0   0.0   0.0   0.0   1.0   0.0   

   ...  1456  1457  1458  1459  1460  1461  1462  1463  1464  1465  
0  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[5 rows x 1466 columns]


In [8]:
# Step 7: Train-Test Split (Optional)
# Example: If target column is 'Temperature'
if 'Temperature' in data.columns:
    X = data.drop('Temperature', axis=1)
    y = data['Temperature']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("\n✅ Train-Test Split done!")
    print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
