In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.optimizers import Adam

This notebook predicts the number of new stargazers a repository gains on the 31st day after its creation. It focuses on the five repositories with the highest number of stargazers and tracks their stargazer growth over the first 30 days since creation. For each day, the dataset contains the number of new stargazers and the average betweenness centrality (BC) of all stargazers contributing to the repository on that day. From these 30 daily observations, the goal is to forecast the number of new stargazers on the 31st day after repository creation.

In [None]:
# Load the per-repository daily time series (one row per repository and day).
csv_path = '/content/drive/MyDrive/PMF 2 4/STROJNO + MREŽE/Projekt_v2/final_for_LSTM.csv'
final_df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows (head() defaults to n=5).
final_df.head()

Unnamed: 0,full_name,date,stargazers_count,days_since_creation,average_bc
0,AmrDeveloper/ClangQL,2024-04-05,2,0,0.0
1,AmrDeveloper/ClangQL,2024-04-06,22,1,0.001326
2,AmrDeveloper/ClangQL,2024-04-07,79,2,0.001998
3,AmrDeveloper/ClangQL,2024-04-08,113,3,0.000545
4,AmrDeveloper/ClangQL,2024-04-09,75,4,0.002579


In [None]:
from keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from keras.layers import LSTM, Dropout, Dense

In [None]:
# Re-reads the same CSV as the earlier load cell; `final_df` is rebound to a
# freshly parsed frame.
final_df = pd.read_csv(
    '/content/drive/MyDrive/PMF 2 4/STROJNO + MREŽE/Projekt_v2/final_for_LSTM.csv'
)

In [None]:
def create_sequences(data, sequence_length, target_col=-1):
    """Turn a 2-D time series into sliding-window samples for supervised learning.

    Each sample is a window of ``sequence_length`` consecutive rows; its target
    is the value of column ``target_col`` in the row immediately after the
    window.

    Parameters
    ----------
    data : array-like of shape (n_rows, n_features)
        Rows must already be in chronological order.
    sequence_length : int
        Number of consecutive rows in each input window.
    target_col : int, default -1
        Index of the column to use as the prediction target. The default
        (last column) matches the previous hard-coded behaviour.

    Returns
    -------
    X : np.ndarray of shape (n_windows, sequence_length, n_features)
    y : np.ndarray of shape (n_windows,)
        ``n_windows`` is ``max(len(data) - sequence_length, 0)``.
    """
    data = np.asarray(data)
    n_windows = len(data) - sequence_length

    if n_windows <= 0:
        # Too short for even one (window, target) pair. Return empty arrays
        # that keep the trailing dimensions, so the result can still be
        # stacked with non-empty results from other series.
        empty_X = np.empty((0, sequence_length) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,), dtype=data.dtype)
        return empty_X, empty_y

    X = np.array([data[start:start + sequence_length] for start in range(n_windows)])
    # Row `sequence_length + k` is the step right after window k.
    y = data[sequence_length:, target_col].copy()
    return X, y

In [None]:
# Build the supervised samples: one set of sliding windows per repository,
# stacked into a single (n_samples, SEQUENCE_LENGTH, n_features) array.
#
# Column order matters: create_sequences takes the target from the LAST
# column, and the later inverse-scaling step also reads the last column.
# The quantity to forecast is 'stargazers_count', so it must come last
# (with 'average_bc' last the model would be trained to predict BC instead).
FEATURE_COLUMNS = ['days_since_creation', 'average_bc', 'stargazers_count']
SEQUENCE_LENGTH = 30

all_X = []
all_y = []

for repo in final_df['full_name'].unique():
    # Windows are positional, so make sure the rows are in chronological order.
    repo_data = final_df[final_df['full_name'] == repo].sort_values('days_since_creation')

    # At least SEQUENCE_LENGTH + 1 rows are needed for one (window, target)
    # pair; shorter series would yield empty arrays that cannot be stacked.
    if len(repo_data) <= SEQUENCE_LENGTH:
        continue

    # Each repository is scaled independently to [0, 1].
    # NOTE: `scaler` and `scaled_data` remain bound to the last processed
    # repository after the loop and are used later for inverse-scaling.
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(repo_data[FEATURE_COLUMNS])

    repo_X, repo_y = create_sequences(scaled_data, sequence_length=SEQUENCE_LENGTH)

    all_X.append(repo_X)
    all_y.append(repo_y)

if not all_X:
    raise ValueError(
        f'No repository has more than {SEQUENCE_LENGTH} rows; cannot build any sequences.'
    )

X = np.vstack(all_X)
y = np.concatenate(all_y)

In [None]:
# Positional 80/20 split: the first 80% of the stacked samples are used for
# training and the remainder for testing (no shuffling).
train_size = int(len(X) * 0.8)
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

# Two stacked LSTM layers with dropout after each, followed by a single-unit
# dense layer that outputs one value per sample.
n_timesteps = X_train.shape[1]
n_features = X_train.shape[2]
model = Sequential([
    LSTM(100, return_sequences=True, input_shape=(n_timesteps, n_features)),
    Dropout(0.3),
    LSTM(50),
    Dropout(0.3),
    Dense(1),
])

X_train shape: (4, 30, 3)
X_test shape: (1, 30, 3)
y_train shape: (4,)
y_test shape: (1,)


In [None]:
# Fit with MSE loss; training stops once val_loss has not improved for
# 10 epochs, and the best weights seen so far are restored.
model.compile(loss='mean_squared_error', optimizer='adam')

early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=2,
)

predictions = model.predict(X_test)

# The scaler was fitted on every feature column, so a lone target column has
# to be left-padded with zeros to the full width before inverse_transform;
# only the last column of the result is kept.
# NOTE(review): `scaler` / `scaled_data` are whatever the sequence-building
# loop left bound last, i.e. they belong to the last repository processed —
# confirm that this matches the repository the test samples come from.
n_padding_cols = scaled_data.shape[1] - 1


def _to_original_units(scaled_target):
    """Map scaled target values back to the original units of the last column."""
    column = scaled_target.reshape(-1, 1)
    padded = np.concatenate((np.zeros((column.shape[0], n_padding_cols)), column), axis=1)
    return scaler.inverse_transform(padded)[:, -1]


predictions_rescaled = _to_original_units(predictions)
y_test_rescaled = _to_original_units(y_test)

Epoch 1/100
1/1 - 6s - loss: 0.2063 - val_loss: 0.0164 - 6s/epoch - 6s/step
Epoch 2/100
1/1 - 0s - loss: 0.1277 - val_loss: 0.0474 - 75ms/epoch - 75ms/step
Epoch 3/100
1/1 - 0s - loss: 0.1511 - val_loss: 0.0953 - 63ms/epoch - 63ms/step
Epoch 4/100
1/1 - 0s - loss: 0.2108 - val_loss: 0.1273 - 59ms/epoch - 59ms/step
Epoch 5/100
1/1 - 0s - loss: 0.1712 - val_loss: 0.1263 - 58ms/epoch - 58ms/step
Epoch 6/100
1/1 - 0s - loss: 0.1423 - val_loss: 0.1222 - 85ms/epoch - 85ms/step
Epoch 7/100
1/1 - 0s - loss: 0.1771 - val_loss: 0.1088 - 60ms/epoch - 60ms/step
Epoch 8/100
1/1 - 0s - loss: 0.2056 - val_loss: 0.0965 - 59ms/epoch - 59ms/step
Epoch 9/100
1/1 - 0s - loss: 0.1154 - val_loss: 0.0885 - 75ms/epoch - 75ms/step
Epoch 10/100
1/1 - 0s - loss: 0.2541 - val_loss: 0.0807 - 59ms/epoch - 59ms/step
Epoch 11/100
1/1 - 0s - loss: 0.1932 - val_loss: 0.0731 - 112ms/epoch - 112ms/step


In [None]:
# Score the rescaled predictions against the rescaled ground truth.
# Note: R2 is not defined for a single test sample, and MAPE explodes when the
# true value is (close to) zero.
squared_error = mean_squared_error(y_test_rescaled, predictions_rescaled)
rmse = np.sqrt(squared_error)
r2 = r2_score(y_test_rescaled, predictions_rescaled)
mape = mean_absolute_percentage_error(y_test_rescaled, predictions_rescaled)

print(f'RMSE: {rmse}', f'R-squared (R2): {r2}', f'MAPE: {mape}', sep='\n')

RMSE: 0.002545796825955642
R-squared (R2): nan
MAPE: 11465249636734.82


