In [41]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.metrics import accuracy_score, r2_score
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor

import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
pth = './data.csv'
d = pd.read_csv(pth)

In [3]:
def string_to_float(feature):
    l = d[feature].unique()
    l_dict = dict(zip(l, np.arange(0.0, 1.0, 1/(len(l)+1))))
    return l_dict         

In [44]:
def preprocess(d): 
    clean_language = string_to_float('language')
    clean_license = string_to_float('license')
    d = d.drop(columns=['private', 'url', 'Unnamed: 0'])
    d['language'] = d['language'].apply(lambda x: clean_language[x])
    d['license'] = d['license'].apply(lambda x: clean_license[x])
    d = d.drop(columns=['watchers'])

    train_split = 0.9
    N = len(X)
    M = np.ceil(train_split*N).astype(int)
    idx = np.random.RandomState(seed=42).permutation(N)
    X_train, X_val = X.iloc[idx[:M]], X.iloc[idx[M:]]
    y_train, y_val = y.iloc[idx[:M]], y.iloc[idx[M:]]
    return X_train, X_val, y_train, y_val

In [134]:
def preprocess_ML(d): 
    clean_language = string_to_float('language')
    clean_license = string_to_float('license')
    d = d.drop(columns=['private', 'url', 'Unnamed: 0'])
    d['language'] = d['language'].apply(lambda x: clean_language[x])
    d['license'] = d['license'].apply(lambda x: clean_license[x])
    d = d.drop(columns=['watchers'])
    
    features = ['forks','language','size','subscribers','network','issues','pulls','commits','license']
    repo = d[features]
    stars = d['stars']
    
    return repo, stars

In [6]:
X_train, X_val, y_train, y_val = preprocess(d)

In [7]:
X_train.head()

Unnamed: 0,forks,language,size,subscribers,network,issues,pulls,commits,license
920,448,0.16,2635,188,448,52,4,643,0.5
525,2785,0.58,48191,646,2785,321,27,169,0.1
567,1835,0.0,592540,187,1835,303,1,2226,0.1
657,3259,0.34,33949,645,3259,98,24,1277,0.2
633,924,0.14,74610,231,924,388,19,6236,0.0


In [8]:
y_train.head()

920    13114
525    18211
567    17600
657    16255
633    16724
Name: stars, dtype: int64

In [9]:
def train_model(model, X_train, y_train):
    model.fit(X_train, y_train)

In [10]:
def get_r2_score(model, X_val, y_val):
    predicted = model.predict(X_val)
    return r2_score(y_val, predicted)

In [11]:
def get_accuracy(model, X_val, y_val):
    predicted = model.predict(X_val)
    predicted_int = [round(x) for x in predicted] 
    print(f'Predicted: {predicted_int[:5]}\nTrue:{y_val.to_list()[:5]}')
    return accuracy_score(y_val, predicted_int)

# Models

In [12]:
model = LinearRegression()
model.fit(X_train, y_train)
print(f'R2 score is: {get_r2_score(model, X_val, y_val)}')
print(f'Accuracy is: {get_accuracy(model, X_val, y_val)}')

R2 score is: 0.7278331547856001
Predicted: [17258, 20291, 28409, 17678, 18118]
True:[13696, 25442, 29234, 34058, 17695]
Accuracy is: 0.0


In [13]:
filename = 'model.sav'
pickle.dump(model, open(filename, 'wb'))

In [14]:
!ls

data.csv    get_data.py ml.ipynb    model.sav


In [18]:
model = BaggingRegressor(base_estimator=LinearRegression())
train_model(model, X_train, y_train)
print(f'R2 score is: {get_r2_score(model, X_val, y_val)}')
print(f'Accuracy is: {get_accuracy(model, X_val, y_val)}')

R2 score is: 0.7249964491837965
Predicted: [17184, 20232, 28222, 17468, 18128]
True:[13696, 25442, 29234, 34058, 17695]
Accuracy is: 0.0


In [19]:
model = Ridge()
train_model(model, X_train, y_train)
print(f'R2 score is: {get_r2_score(model, X_val, y_val)}')
print(f'Accuracy is: {get_accuracy(model, X_val, y_val)}')

R2 score is: 0.7278525514205273
Predicted: [17171, 20481, 28363, 18075, 18084]
True:[13696, 25442, 29234, 34058, 17695]
Accuracy is: 0.0


In [20]:
model = BayesianRidge()
train_model(model, X_train, y_train)
print(f'R2 score is: {get_r2_score(model, X_val, y_val)}')
print(f'Accuracy is: {get_accuracy(model, X_val, y_val)}')

R2 score is: 0.7108585649158248
Predicted: [14658, 24671, 26186, 24851, 17354]
True:[13696, 25442, 29234, 34058, 17695]
Accuracy is: 0.010101010101010102


In [21]:
n_neighbors = 5
model = KNeighborsRegressor(n_neighbors,weights='uniform')
train_model(model, X_train, y_train)
print(f'R2 score is: {get_r2_score(model, X_val, y_val)}')
print(f'Accuracy is: {get_accuracy(model, X_val, y_val)}')

R2 score is: 0.2012440935454889
Predicted: [17576, 34659, 25606, 118894, 19346]
True:[13696, 25442, 29234, 34058, 17695]
Accuracy is: 0.0


In [22]:
model = DecisionTreeRegressor(max_depth=100)
train_model(model, X_train, y_train)
print(f'R2 score is: {get_r2_score(model, X_val, y_val)}')
print(f'Accuracy is: {get_accuracy(model, X_val, y_val)}')

R2 score is: 0.32206510990397885
Predicted: [14331, 19158, 25957, 21420, 29069]
True:[13696, 25442, 29234, 34058, 17695]
Accuracy is: 0.0


# NN 
WIP

In [152]:
class CustomDataset(Dataset):
    def __init__(self, repo, stars):
        self.repo = repo
        self.stars = stars
        
    def __len__(self):
        return len(self.stars)

    def __getitem__(self, idx):
        repo = self.repo.iloc[idx].to_numpy()
        stars = self.stars.iloc[idx]
        return repo, stars

In [153]:
repo, stars = preprocess_ML(d)
training_data = CustomDataset(repo, stars)

bs = 64
train_dl = DataLoader(training_data, batch_size=bs, shuffle=True)

In [154]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
#print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(9, 200),
            nn.ReLU(),
            nn.Linear(200, 200),
            nn.ReLU(),
            nn.Linear(200, 1)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork().to(device)
#print(model)

In [155]:
def r2_loss(output, target):
    target_mean = torch.mean(target)
    ss_tot = torch.sum((target - target_mean) ** 2)
    ss_res = torch.sum((target - output) ** 2)
    r2 = 1 - ss_res / ss_tot
    return r2

In [156]:
# r2_loss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [157]:
def train(dataloader, model, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = r2_loss(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            #print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [158]:
def test(dataloader, model):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [159]:
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dl, model, optimizer)
    test(test_dl, model)
print("Done!")

Epoch 1
-------------------------------


RuntimeError: expected scalar type Float but found Double