In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
import json

In [3]:
# Read the dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='main.csv')
df.head()

Unnamed: 0,username,id,media_type,like_count,comments_count
0,deparmedya,17990918969458700,IMAGE,6.0,0
1,beyazyakaliyiz,18219250732221000,VIDEO,22.0,1
2,kafesfirin,18311380465102300,VIDEO,19.0,0
3,vimerang,18089518138361500,VIDEO,19.0,1
4,totalenergies_istasyonlari,18012743929758400,VIDEO,21.0,0


In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5413, 5)

In [5]:
# Column dtypes and non-null counts; a quick way to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5413 entries, 0 to 5412
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   username        5413 non-null   object 
 1   id              5413 non-null   int64  
 2   media_type      5413 non-null   object 
 3   like_count      5410 non-null   float64
 4   comments_count  5413 non-null   int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 211.6+ KB


In [6]:
# Columns that clean_data() removes from the frame.
columns_to_drop = ['username']

In [7]:
def clean_data(df, columns=None):
    """Return a new frame equal to ``df`` without the unwanted columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is not modified.
    columns : list of str, optional
        Column labels to remove. When omitted, the notebook-level
        ``columns_to_drop`` list is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        New frame with the selected columns removed.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if columns is None:
        # Looked up at call time so edits to the global list are honoured.
        columns = columns_to_drop
    return df.drop(columns=columns)

In [8]:
# Remove the unwanted columns and preview the result.
# NOTE(review): `df` is rebound here, so re-running this cell without
# reloading the data raises KeyError (the column is already gone).
df = clean_data(df)
df.head()

Unnamed: 0,id,media_type,like_count,comments_count
0,17990918969458700,IMAGE,6.0,0
1,18219250732221000,VIDEO,22.0,1
2,18311380465102300,VIDEO,19.0,0
3,18089518138361500,VIDEO,19.0,1
4,18012743929758400,VIDEO,21.0,0


In [9]:
# Number of rows per media type.
df['media_type'].value_counts()

media_type
IMAGE             2754
VIDEO             1493
CAROUSEL_ALBUM    1166
Name: count, dtype: int64

In [10]:
# Recode the media type labels as integers.
# NOTE(review): pandas 2.2 deprecates the implicit object -> int downcast that
# `replace` performs here — confirm the resulting dtype on newer pandas.
media_type_codes = {'IMAGE': 0, 'VIDEO': 1, 'CAROUSEL_ALBUM': 2}
df['media_type'] = df['media_type'].replace(to_replace=media_type_codes)

In [11]:
# Display the frame after media_type has been encoded (pandas truncates the view).
df

Unnamed: 0,id,media_type,like_count,comments_count
0,17990918969458700,0,6.0,0
1,18219250732221000,1,22.0,1
2,18311380465102300,1,19.0,0
3,18089518138361500,1,19.0,1
4,18012743929758400,1,21.0,0
...,...,...,...,...
5408,17977428839417500,2,16.0,0
5409,18003986413814100,2,14.0,0
5410,17846807598040300,2,14.0,0
5411,18270366805196600,2,14.0,0


In [12]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [13]:
# Missing-value count per column.
df.isnull().sum()

id                0
media_type        0
like_count        3
comments_count    0
dtype: int64

In [14]:
# Drop every row that contains a missing value. Rebinding `df` instead of
# using `inplace=True` keeps the step chainable and avoids mutating a frame
# that earlier cells have already displayed.
df = df.dropna()
# Confirm that no missing values remain.
df.isnull().sum()

id                0
media_type        0
like_count        0
comments_count    0
dtype: int64

In [15]:
# Display the frame after rows with missing values were removed.
df

Unnamed: 0,id,media_type,like_count,comments_count
0,17990918969458700,0,6.0,0
1,18219250732221000,1,22.0,1
2,18311380465102300,1,19.0,0
3,18089518138361500,1,19.0,1
4,18012743929758400,1,21.0,0
...,...,...,...,...
5408,17977428839417500,2,16.0,0
5409,18003986413814100,2,14.0,0
5410,17846807598040300,2,14.0,0
5411,18270366805196600,2,14.0,0


In [16]:
# Number of distinct values in comments_count.
df['comments_count'].nunique()

346

In [17]:
# Distinct comments_count values, in order of first appearance.
df['comments_count'].unique()

array([    0,     1,     6,     2,     8,     4,     3,    53,     5,
         720, 12245,    14,    12, 20532,     7, 22338,    10,    47,
          22,   131,    19,    11,   197,    85,    16,     9,    35,
         191,    42,    23,    18,    80,    20,    28,    36,    15,
          60,    31,    32,    69,    58,   273,    66,   392,    84,
         107,   141,    48,    29,    39,    64,   120,   666,   914,
         116,   227,   391,    56,    27,    40,   332,    37,    13,
          24,    34,    70,    21,    25,   502,   363,   667,    89,
          73,    75,   399,   469,   339,   267,   506,   263,   366,
         153,   566,   398,    94,   209,   135,   488,   139,   226,
         210,   182,    90,   334,   148,   196,   569,   362,   155,
         147,   475,   354,   247,   178,   199,   597,  1009,    38,
          17,    83,    41,    65,   428,   562,    30,   319,    46,
          26,    97,   121,    50,    43,    49,    55,    71,    79,
         184,   289,

In [18]:
# Feature matrix and regression target.
# NOTE(review): `id` is used as a predictor although it looks like an
# identifier — confirm this is intended.
X = df.loc[:, ['id', 'comments_count', 'media_type']]
y = df.loc[:, 'like_count']

In [19]:
# Column groups that are routed to the numeric and categorical transformers
# of the preprocessing step.
numeric_features = ['id', 'comments_count']
categorical_features = ['media_type']

In [20]:
# Numeric columns are standardised with StandardScaler.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

In [21]:
# Categorical columns are one-hot encoded; drop='first' omits the column of
# the first category of each feature.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(drop='first'))])

In [22]:
# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# One pipeline per candidate regressor, each preceded by the preprocessing step.
# NOTE(review): every pipeline holds the same `preprocessor` object rather
# than a copy — confirm that sharing it between pipelines is intended.
models = {
    label: Pipeline([('preprocessor', preprocessor), ('regressor', estimator)])
    for label, estimator in (
        ('Linear Regression', LinearRegression()),
        ('Random Forest', RandomForestRegressor(random_state=42)),
    )
}

In [25]:
# Fit every pipeline on the training split and score it on the held-out split.
results = {}
for name, pipeline in models.items():
    # fit() returns the fitted pipeline, so prediction can be chained onto it.
    predictions = pipeline.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    results[name] = {'MSE': mse, 'R2 Score': r2}

In [26]:
# Save the held-out predictions of the best model as JSON.
# The model is selected by its test-set R2 instead of a hard-coded name.
best_model_name = max(results, key=lambda name: results[name]['R2 Score'])
best_model = models[best_model_name]
predictions = best_model.predict(X_test)

# Keys are the `id` values of the test rows. Predictions are rounded to the
# nearest integer; a bare int() would truncate toward zero instead.
output = {
    str(int(media_id)): int(round(prediction))
    for media_id, prediction in zip(X_test['id'], predictions)
}

with open('regression-output.json', 'w') as f:
    json.dump(output, f)

In [27]:
# One summary line per evaluated model with its test-set R2.
for model in results:
    metrics = results[model]
    print(f"{model}: R2 Score = {metrics['R2 Score']:.2f}")

Linear Regression: R2 Score = 0.30
Random Forest: R2 Score = 0.88


In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
import json

# Load the training data from the working directory.
df = pd.read_csv('main.csv')

# Columns that clean_data() removes from the frame.
columns_to_drop = ['username']

def clean_data(df, columns=None):
    """Return a new frame equal to ``df`` without the unwanted columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is not modified.
    columns : list of str, optional
        Column labels to remove. When omitted, the module-level
        ``columns_to_drop`` list is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        New frame with the selected columns removed.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if columns is None:
        # Looked up at call time so edits to the global list are honoured.
        columns = columns_to_drop
    return df.drop(columns=columns)

# Remove the unwanted columns.
df = clean_data(df)

# Encode media_type as integer codes.
df['media_type'] = df['media_type'].replace({
    'IMAGE': 0,
    'VIDEO': 1,
    'CAROUSEL_ALBUM': 2
})

# Drop rows with missing values; duplicates are not removed here.
# `df` is rebound rather than mutated in place so the step stays chainable.
df = df.dropna()

# Feature and target selection.
# NOTE(review): `id` is used as a predictor although it looks like an
# identifier — confirm this is intended.
X = df[['id', 'comments_count', 'media_type']]
y = df['like_count']

# Column groups handled by the preprocessing step.
numeric_features = ['id', 'comments_count']
categorical_features = ['media_type']

# Numeric columns are standardised; categorical columns are one-hot encoded
# with the first category's column omitted.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(drop='first'))])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One pipeline per candidate regressor, each preceded by the preprocessing step.
models = {
    label: Pipeline([('preprocessor', preprocessor), ('regressor', estimator)])
    for label, estimator in (
        ('Linear Regression', LinearRegression()),
        ('Random Forest', RandomForestRegressor(random_state=42)),
    )
}

# Fit every pipeline on the training split and score it on the held-out split.
results = {}
for name, pipeline in models.items():
    # fit() returns the fitted pipeline, so prediction can be chained onto it.
    predictions = pipeline.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    results[name] = {'MSE': mse, 'R2 Score': r2}

# Select the pipeline with the highest test-set R2.
best_model_name = max(results, key=lambda x: results[x]['R2 Score'])
best_model = models[best_model_name]

# External JSONL input (one JSON object per line) and JSON output paths.
test_file_path = 'test-regression-round3.jsonl'
output_file_path = 'regression-output.json'

# Read the records with an explicit UTF-8 encoding so decoding does not depend
# on the platform default, and skip blank lines (for example a trailing empty
# line), which json.loads would reject.
with open(test_file_path, 'r', encoding='utf-8') as f:
    test_data = [json.loads(line) for line in f if line.strip()]

# Prepare test set for prediction.
test_df = pd.DataFrame(test_data)

# Encode media_type with the same integer codes used for the training data.
test_df['media_type'] = test_df['media_type'].replace({
    'IMAGE': 0,
    'VIDEO': 1,
    'CAROUSEL_ALBUM': 2
})

# Use only the columns the pipeline was fitted on.
X_test_external = test_df[['id', 'comments_count', 'media_type']]

# Make predictions.
predictions = best_model.predict(X_test_external)

# Map each record's id (as a string) to its prediction rounded to the nearest integer.
output = {str(int(row['id'])): int(round(pred)) for row, pred in zip(test_data, predictions)}

# Save predictions to the output file.
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=2)

# Report both metrics for every evaluated model.
for model in results:
    metrics = results[model]
    print(f"{model}: R2 Score = {metrics['R2 Score']:.2f}, MSE = {metrics['MSE']:.2f}")

# Then name the selected model and where its predictions were written.
print(
    f"Best model: {best_model_name}",
    f"Predictions saved to {output_file_path}",
    sep="\n",
)

Linear Regression: R2 Score = 0.30, MSE = 1088312889.50
Random Forest: R2 Score = 0.88, MSE = 187070362.69
Best model: Random Forest
Predictions saved to regression-output.json
