In [1]:
import pandas as pd
import plotly
import sklearn
import math
import plotly.plotly as py
from plotly.offline import init_notebook_mode
import plotly.graph_objs as go

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


from scipy.special import softmax

import numpy as np

init_notebook_mode(connected=True)

In [2]:
def split_dataset(df, train_frac):
    # Shuffle
    df = sklearn.utils.shuffle(df)
    sz = len(df.index)
    return df[:int(sz * train_frac)], df[int(sz * train_frac):]

def clf_accuracy(results, labels):
    assert len(results) == len(labels)
    assert len(results)
    
    return  sum([int(labels[i] == results[i]) for i in range(len(results))]) / len(results)

### Задание 1

In [190]:
noisysine_df = pd.read_csv('noisysine.csv', sep=',', header=0)
noisysine_train, noisysine_test = split_dataset(noisysine_df, 0.8)

In [193]:
def simple_regression(X, y):
    X = np.array(X.values)
    y = np.array(y.values)
    
    w = np.linalg.pinv(X.T.dot(X)).dot(X.T).dot(y)
    return w
    
def predict_simple_regression(w, X):
    X = np.array(X.values)
    w = np.array(w)
    return X.dot(w)

In [192]:
noisysine_simple_df = noisysine_df.copy()

x = pd.DataFrame({'x' : np.linspace(0, 30, 200)})
data = []

for i in range(2, 6):
    noisysine_simple_df['x' + str(i)] = noisysine_simple_df['x'] ** i
    x['x' + str(i)] = x['x'] ** i
    noisysine_simple_train_df = noisysine_simple_df.loc[noisysine_train.index]
    noisysine_simple_test_df = noisysine_simple_df.loc[noisysine_test.index]
    
    real_y = noisysine_simple_test_df.sort_values(by=['x'])['y']
    pred_y = predict_simple_regression(
                simple_regression(noisysine_simple_train_df.drop(columns=['y']), noisysine_simple_train_df['y']),
                noisysine_simple_test_df.drop(columns=['y']).sort_values(by=['x']))
    
    print("Degree =", i, ", r2 = ", sklearn.metrics.r2_score(real_y, pred_y))
    
    
    data.append(
    go.Scatter(
        y = predict_simple_regression(
            simple_regression(noisysine_simple_train_df.drop(columns=['y']), noisysine_simple_train_df['y']), x),
        x = x['x'],
        mode = 'lines',
        name = 'pred values: deg {}'.format(i)
    ))
data.append(
        go.Scatter(
            y = noisysine_simple_test_df.sort_values(by=['x'])['y'],
            x = noisysine_simple_test_df.sort_values(by=['x'])['x'],
            mode = 'markers',
            name = 'real values'
        ))
plotly.offline.iplot(data, filename='real_values.html')

Degree = 2 , r2 =  0.12861593614419853
Degree = 3 , r2 =  0.025406393703714447
Degree = 4 , r2 =  -0.030148188654944175
Degree = 5 , r2 =  0.8767939982729386


In [196]:
hydrodynamics_df = pd.read_csv('hydrodynamics.csv', sep=',', header=0)
hydrodynamics_train, hydrodynamics_test = split_dataset(hydrodynamics_df, 0.8)

In [197]:
hydrodynamics_simple_df = hydrodynamics_df.copy()

for i in [2]:
    columns = hydrodynamics_simple_df.drop(columns=['y']).columns
    
    for c1 in columns:
        for c2 in columns:
            hydrodynamics_simple_df[c1 + c2] = hydrodynamics_simple_df[c1] * hydrodynamics_simple_df[c2]
            
    hydrodynamics_simple_train_df = hydrodynamics_simple_df.loc[hydrodynamics_train.index]
    hydrodynamics_simple_test_df = hydrodynamics_simple_df.loc[hydrodynamics_test.index]
    
    real_y = hydrodynamics_simple_test_df['y']
    pred_y = predict_simple_regression(
                simple_regression(hydrodynamics_simple_train_df.drop(columns=['y']), hydrodynamics_simple_train_df['y']),
                hydrodynamics_simple_test_df.drop(columns=['y']))
    
    print("Degree =", i, ", r2 = ", sklearn.metrics.r2_score(real_y, pred_y))

Degree = 2 , r2 =  0.9183287971422637


### Задание 2

In [198]:
def ridge_regression(X, y, alpha=0.001):
    X = np.array(X.values)
    y = np.array(y.values)
    
    w = np.linalg.inv(X.T.dot(X) - np.eye(X.T.shape[0]) * alpha).dot(X.T).dot(y)
    return w

Тут матрица обратима, отличные от 0 значени $\alpha$ только уменьшают R2.

In [199]:
noisysine_ridge_df = noisysine_df.copy()

x = pd.DataFrame({'x' : np.linspace(0, 30, 200)})
data = []

for i in range(2, 6):
    noisysine_ridge_df['x' + str(i)] = noisysine_ridge_df['x'] ** i
    x['x' + str(i)] = x['x'] ** i
    noisysine_ridge_train_df = noisysine_ridge_df.loc[noisysine_train.index]
    noisysine_ridge_test_df = noisysine_ridge_df.loc[noisysine_test.index]
    
    real_y = noisysine_ridge_test_df.sort_values(by=['x'])['y']
    pred_y = predict_simple_regression(
                ridge_regression(noisysine_ridge_train_df.drop(columns=['y']), noisysine_ridge_train_df['y'], 0.000001),
                noisysine_ridge_test_df.drop(columns=['y']).sort_values(by=['x']))
    
    print("Degree =", i, ", r2 = ", sklearn.metrics.r2_score(real_y, pred_y))
    
    
    data.append(
        go.Scatter(
            y = predict_simple_regression(
                ridge_regression(noisysine_ridge_train_df.drop(columns=['y']), noisysine_ridge_train_df['y'], 0.000001), x),
            x = x['x'],
            mode = 'lines',
            name = 'pred values: deg {}'.format(i)
        ))

data.append(
        go.Scatter(
            y = noisysine_ridge_test_df.sort_values(by=['x'])['y'],
            x = noisysine_simple_test_df.sort_values(by=['x'])['x'],
            mode = 'markers',
            name = 'real values'
        ))
plotly.offline.iplot(data, filename='real_values.html')

Degree = 2 , r2 =  0.12861593611010347
Degree = 3 , r2 =  0.025406393390332238
Degree = 4 , r2 =  -0.03014819207294983
Degree = 5 , r2 =  0.8767940100286271


А вот тут можно выжать ещё доли процента.

In [200]:
hydrodynamics_ridge_df = hydrodynamics_df.copy()

for i in [2]:
    columns = hydrodynamics_ridge_df.drop(columns=['y']).columns
    
    for c1 in columns:
        for c2 in columns:
            hydrodynamics_ridge_df[c1 + c2] = hydrodynamics_ridge_df[c1] * hydrodynamics_ridge_df[c2]
            
    hydrodynamics_ridge_train_df = hydrodynamics_ridge_df.loc[hydrodynamics_train.index]
    hydrodynamics_ridge_test_df = hydrodynamics_ridge_df.loc[hydrodynamics_test.index]
    
    real_y = hydrodynamics_ridge_test_df['y']
    pred_y = predict_simple_regression(
                ridge_regression(hydrodynamics_ridge_train_df.drop(columns=['y']), hydrodynamics_ridge_train_df['y'], 0.000025),
                hydrodynamics_ridge_test_df.drop(columns=['y']))
    
    print("Degree =", i, ", r2 = ", sklearn.metrics.r2_score(real_y, pred_y))

Degree = 2 , r2 =  0.9212201307918075


### Задание 3

In [201]:
noisysine_lasso_df = noisysine_df.copy()

x = pd.DataFrame({'x' : np.linspace(0, 30, 200)})
data = []

for i in range(2, 6):
    noisysine_lasso_df['x' + str(i)] = noisysine_lasso_df['x'] ** i
    x['x' + str(i)] = x['x'] ** i
    noisysine_lasso_train_df = noisysine_lasso_df.loc[noisysine_train.index]
    noisysine_lasso_test_df = noisysine_lasso_df.loc[noisysine_test.index]
    
    model = sklearn.linear_model.Lasso(alpha=0.002, max_iter=10000000, selection='cyclic')
    model.fit(noisysine_lasso_train_df.drop(columns=['y']), noisysine_lasso_train_df['y'])
    
    real_y = noisysine_lasso_test_df.sort_values(by=['x'])['y']
    pred_y = model.predict(noisysine_lasso_test_df.drop(columns=['y']).sort_values(by=['x']))
    
    print("Degree =", i, ", r2 = ", sklearn.metrics.r2_score(real_y, pred_y),
          "features used =", sum(map(lambda x: 1 if x > 0 else 0, model.coef_)))
    
    

    
    data.append(
        go.Scatter(
            y = model.predict(x),
            x = x['x'],
            mode = 'lines',
            name = 'pred values: deg {}'.format(i)
        ))

data.append(
        go.Scatter(
            y = noisysine_lasso_test_df.sort_values(by=['x'])['y'],
            x = noisysine_lasso_test_df.sort_values(by=['x'])['x'],
            mode = 'markers',
            name = 'real values'
        ))
plotly.offline.iplot(data, filename='real_values.html')

Degree = 2 , r2 =  0.3959652976869852 features used = 0
Degree = 3 , r2 =  0.45232397747904696 features used = 1
Degree = 4 , r2 =  0.05239805075468107 features used = 2
Degree = 5 , r2 =  0.8417272448897091 features used = 3


In [202]:
hydrodynamics_lasso_df = hydrodynamics_df.copy()

for i in [2]:
    columns = hydrodynamics_lasso_df.drop(columns=['y']).columns
    
    for c1 in columns:
        for c2 in columns:
            hydrodynamics_lasso_df[c1 + c2] = hydrodynamics_lasso_df[c1] * hydrodynamics_lasso_df[c2]
            
    hydrodynamics_lasso_train_df = hydrodynamics_lasso_df.loc[hydrodynamics_train.index]
    hydrodynamics_lasso_test_df = hydrodynamics_lasso_df.loc[hydrodynamics_test.index]
    
    model = sklearn.linear_model.Lasso(alpha=0.0072, max_iter=100000, selection='cyclic')
    model.fit(hydrodynamics_lasso_train_df.drop(columns=['y']), hydrodynamics_lasso_train_df['y'])        
    
    real_y = hydrodynamics_lasso_test_df['y']
    pred_y = model.predict(hydrodynamics_lasso_test_df.drop(columns=['y']))

    
    print("Degree =", i, ", r2 = ", sklearn.metrics.r2_score(real_y, pred_y), 
          "features used =", sum(map(lambda x: 1 if x > 0 else 0, model.coef_)))

Degree = 2 , r2 =  0.902873621043669 features used = 10


### Задание 4

In [21]:
import random

In [22]:
tsp_df = pd.read_csv('tsp.csv', sep=',', header=0)

In [23]:
def draw_path(sequence, df):
    xs = []
    ys = []
    for j in sequence:
        cur_x, cur_y = df.iloc[j]['x'], df.iloc[j]['y']
        xs.append(cur_x)
        xs.append(cur_x)
        if ys:
            ys.append(cur_y)
        ys.append(cur_y)
        
    plotly.offline.iplot([
        go.Scatter(
            y = ys,
            x = xs[:-1],
            mode = 'markers+lines',
            name = 'path'
        )
        ], filename='real_values.html')

In [24]:
sequence = list(range(len(tsp_df.index)))
mk_best_path = None
mk_best_length = None

for i in range(1000):
    random.shuffle(sequence)
    cur_length = 0
    last_x, last_y = None, None
    for j in sequence:
        cur_x, cur_y = tsp_df.iloc[j]['x'], tsp_df.iloc[j]['y']
        if (last_x is not None and last_y is not None):
            cur_length += abs(last_x - cur_x) + abs(last_y - cur_y)
        last_x, last_y = cur_x, cur_y
    if (mk_best_length is None or mk_best_length > cur_length):
        mk_best_length = cur_length
        mk_best_path = list(sequence)

In [25]:
assert mk_best_path is not None
print("Best length:", mk_best_length)
draw_path(mk_best_path, tsp_df)

Best length: 30910


### Задание 5

In [27]:
rw_best_path = None
rw_best_length = None

def dist_to(x, y, j):
    return abs(tsp_df.iloc[j]['x'] - x) + abs(tsp_df.iloc[j]['y'] - y)

def choices(points, weights):
    r = random.random()
    s = sum(weights) * r
    cumsm = 0
    for i in range(len(points)):
        cumsm += weights[i]
        if cumsm >= s:
            return points[i]

for i in range(1000):
    sz = len(tsp_df.index)
    not_visited = set(range(sz))
    cur_length = 0
    last_x, last_y = None, None
    sequence = []
    for _ in range(sz):
        vs = list(not_visited)
        j = None
        if (last_x is not None and last_y is not None):
            j = choices(vs, weights=list(map(lambda x: 1.0 / dist_to(last_x, last_y, x), vs)))
        else:
            j = random.choice(vs)    
        not_visited.remove(j)
        sequence.append(j)
        cur_x, cur_y = tsp_df.iloc[j]['x'], tsp_df.iloc[j]['y']
        if (last_x is not None and last_y is not None):
            cur_length += abs(last_x - cur_x) + abs(last_y - cur_y)
        last_x, last_y = cur_x, cur_y
        
    if (rw_best_length is None or rw_best_length > cur_length):
        rw_best_length = cur_length
        rw_best_path = sequence

In [28]:
assert rw_best_path is not None
print("Best length:", rw_best_length)
draw_path(rw_best_path, tsp_df)

Best length: 20800


### Задание 6

In [29]:
hc_best_path = None
hc_best_length = None

def dist_to(x, y, j):
    return abs(tsp_df.iloc[j]['x'] - x) + abs(tsp_df.iloc[j]['y'] - y)

for i in range(len(tsp_df.index)):
    not_visited = set(range(sz))
    not_visited.remove(i)
    sequence = [i]
    
    cur_length = 0
    last_x, last_y = tsp_df.iloc[i]['x'], tsp_df.iloc[i]['y']
    for _ in range(sz - 1):
        new_x, new_y = None, None
        new_best = None
        new_j = None
        for j in not_visited:
            if new_best is None or dist_to(last_x, last_y, j) < new_best:
                new_best = dist_to(last_x, last_y, j)
                new_x, new_y = tsp_df.iloc[j]['x'], tsp_df.iloc[j]['y']
                new_j = j
        not_visited.remove(new_j)
        sequence.append(new_j)
        
        cur_length += new_best         
        last_x, last_y = new_x, new_y
        
    if (hc_best_length is None or hc_best_length > cur_length):
        hc_best_length = cur_length
        hc_best_path = list(sequence)

In [30]:
assert hc_best_path is not None
print("Best length:", hc_best_length)
draw_path(hc_best_path, tsp_df)

Best length: 9680


### Задание 7

In [203]:
from scipy.special import expit

sequence = random.shuffle(list(range(len(tsp_df.index))))
sa_best_path = None
sa_best_length = None

def seq_len(sequence):
    cur_length = 0
    last_x, last_y = None, None
    for j in sequence:
        cur_x, cur_y = tsp_df.iloc[j]['x'], tsp_df.iloc[j]['y']
        if (last_x is not None and last_y is not None):
            cur_length += abs(last_x - cur_x) + abs(last_y - cur_y)
        last_x, last_y = cur_x, cur_y
    return cur_length

sz = len(tsp_df.index)

for _ in range(1):
    lengths = list(range(2, sz))
    sequence = list(range(sz))
    random.shuffle(sequence)
    for i in range(1000):
        cur_length = seq_len(sequence)
        t = [math.pow(0.98, i * math.sqrt(j - 1)) for j in range(2, sz)]
        
        seqs = []
        
        for _ in range(100):
            swap_length = random.choices(lengths, weights=t)[0]
            r1 = random.randint(0, sz - swap_length)
            new_seq = sequence[:r1] + list(reversed(sequence[r1: r1 + swap_length])) + sequence[r1 + swap_length:]
            seqs.append(new_seq)
        
        new_lengths = [math.exp((cur_length - seq_len(new_seq)) / 1000) for new_seq in seqs]
        sequence = random.choices(seqs, weights=new_lengths)[0]
        
        cur_length = seq_len(sequence)

        if (sa_best_length is None or sa_best_length > cur_length):
            sa_best_length = cur_length
            sa_best_path = sequence

In [204]:
assert sa_best_path is not None
print("Best length:", sa_best_length)
draw_path(sa_best_path, tsp_df)

Best length: 21515


### Задание 8

In [135]:
def get_len(df, permut):
    l = 0
    last = None
    for i in permut:
        if last is not None:
            l += abs(df.iloc[i]['x'] - df.iloc[last]['x']) + abs(df.iloc[i]['y'] - df.iloc[last]['y'])
        last = i
    return l

def dist(i, j):
    return abs(tsp_df.iloc[j]['x'] - tsp_df.iloc[i]['x']) + abs(tsp_df.iloc[j]['y'] - tsp_df.iloc[i]['y'])


def genetic_algorithm_with_mutations(df, starting_pool=200, children=200, survival_rate=0.5, iters=200, mutation_rate=0.01):    
    ga_best_path = None
    ga_best_length = None
    ga_it = None
    
    sz = len(df.index)
    def merge_paths(p1, p2):        
        r1 = random.randint(0, sz - 1)
        r2 = random.randint(r1 + 1, sz)
        
        used = set(p1[r1:r2])
        
        i = 0
        cnt = 0
        path = []
        while cnt < r1:
            if p2[i] not in used:
                path.append(p2[i])
                cnt += 1
            i += 1
        path += p1[r1:r2]
        cnt = len(path)
        while cnt < sz:
            if p2[i] not in used:
                path.append(p2[i])
                cnt += 1
            i += 1
        
        return path
    
    def mutate(path):
        for i in range(len(path)):
            if random.random() < mutation_rate:
                w = random.randint(0, len(path) - 1)
                f = path[i]
                path[i] = path[w]
                path[w] = f
        return path
    
    samples = []
    for _ in range(starting_pool):
        yet_another = list(range(sz))
        random.shuffle(yet_another)
        samples.append(yet_another)
        
    for it in range(iters):
        ls = []
        
        # evaluation
        for sample in samples:
            cost = get_len(df, sample)
            if (ga_best_length is None or cost < ga_best_length):
                ga_it = it
                ga_best_length = cost
                ga_best_path = sample
            ls.append(cost)
        print("Iteration {}: average path length in generation: {}".format(it, sum(ls) / len(samples)))
        
        # selection
        new_samples = random.choices(samples, weights=ls, k=int(survival_rate * len(samples)))
        
        new_gen = []
        samples_sz = len(new_samples)
        # reproduction
        for _ in range(children):
            best_loc = None
            best_loc_p = None
            for _ in range(5):
                new_p = merge_paths(new_samples[random.randint(0, samples_sz - 1)], new_samples[random.randint(0, samples_sz - 1)])
                new_l = get_len(df, new_p)
                if best_loc is None or best_loc > new_l:
                    best_loc = new_l
                    best_loc_p = new_p
            new_gen.append(mutate(best_loc_p))
        samples = new_gen
        
    
    print(ga_it)
    return ga_best_length, ga_best_path

In [136]:
ga_best_length, ga_best_path = genetic_algorithm_with_mutations(tsp_df, starting_pool=1000, survival_rate=0.3, children=200, iters=100)

Iteration 0: average path length in generation: 37289.59
Iteration 1: average path length in generation: 35041.425
Iteration 2: average path length in generation: 33340.275
Iteration 3: average path length in generation: 31996.1
Iteration 4: average path length in generation: 30679.0
Iteration 5: average path length in generation: 29896.75
Iteration 6: average path length in generation: 29223.15
Iteration 7: average path length in generation: 28462.85
Iteration 8: average path length in generation: 27993.45
Iteration 9: average path length in generation: 27556.575
Iteration 10: average path length in generation: 27440.575
Iteration 11: average path length in generation: 27065.25
Iteration 12: average path length in generation: 26753.025
Iteration 13: average path length in generation: 26320.55
Iteration 14: average path length in generation: 25925.35
Iteration 15: average path length in generation: 25881.325
Iteration 16: average path length in generation: 25619.075
Iteration 17: avera

In [137]:
assert ga_best_path is not None
print("Best length:", ga_best_length)
draw_path(ga_best_path, tsp_df)

Best length: 13070
