In [1]:
import pandas as pd
import plotly
import sklearn
import math
import plotly.plotly as py
from plotly.offline import init_notebook_mode
import plotly.graph_objs as go

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


from scipy.special import softmax

import numpy as np

init_notebook_mode(connected=True)

In [2]:
def split_dataset(df, train_frac):
    # Shuffle
    df = sklearn.utils.shuffle(df)
    sz = len(df.index)
    return df[:int(sz * train_frac)], df[int(sz * train_frac):]

def clf_accuracy(results, labels):
    assert len(results) == len(labels)
    assert len(results)
    
    return  sum([int(labels[i] == results[i]) for i in range(len(results))]) / len(results)

### Задание 1

In [3]:
noisysine_df = pd.read_csv('noisysine.csv', sep=',', header=0)
noisysine_train, noisysine_test = split_dataset(noisysine_df, 0.8)

In [20]:
def simple_regression(X, y):
    X = np.array(X.values)
    y = np.array(y.values)
    
    w = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
    return w
    
def predict_simple_regression(w, X):
    X = np.array(X.values)
    w = np.array(w)
    return X.dot(w)

In [21]:
noisysine_simple_df = noisysine_df.copy()

for i in range(2, 6):
    noisysine_simple_df['x' + str(i)] = noisysine_simple_df['x'] ** i
    noisysine_simple_train_df = noisysine_simple_df.loc[noisysine_train.index]
    noisysine_simple_test_df = noisysine_simple_df.loc[noisysine_test.index]
    
    real_y = noisysine_simple_test_df.sort_values(by=['x'])['y']
    pred_y = predict_simple_regression(
                simple_regression(noisysine_simple_train_df.drop(columns=['y']), noisysine_simple_train_df['y']),
                noisysine_simple_test_df.drop(columns=['y']).sort_values(by=['x']))
    
    print("Degree =", i, ", r2 = ", sklearn.metrics.r2_score(real_y, pred_y))
    
    plotly.offline.iplot([
        go.Scatter(
            y = real_y,
            x = noisysine_simple_test_df.sort_values(by=['x'])['x'],
            mode = 'markers+lines',
            name = 'real values'
        ),
        go.Scatter(
            y = pred_y,
            x = sorted(noisysine_simple_test_df['x']),
            mode = 'markers+lines',
            name = 'pred values'
        ),
        ], filename='real_values.html')

Degree = 2 , r2 =  0.08980941568933853


Degree = 3 , r2 =  0.09160806354810858


Degree = 4 , r2 =  0.12814303830480978


Degree = 5 , r2 =  0.9058064864992379


In [6]:
hydrodynamics_df = pd.read_csv('hydrodynamics.csv', sep=',', header=0)
hydrodynamics_train, hydrodynamics_test = split_dataset(hydrodynamics_df, 0.8)

In [7]:
hydrodynamics_simple_df = hydrodynamics_df.copy()

for i in [2]:
    columns = hydrodynamics_simple_df.drop(columns=['y']).columns
    
    for c1 in columns:
        for c2 in columns:
            hydrodynamics_simple_df[c1 + c2] = hydrodynamics_simple_df[c1] * hydrodynamics_simple_df[c2]
            
    hydrodynamics_simple_train_df = hydrodynamics_simple_df.loc[hydrodynamics_train.index]
    hydrodynamics_simple_test_df = hydrodynamics_simple_df.loc[hydrodynamics_test.index]
    
    real_y = hydrodynamics_simple_test_df['y']
    pred_y = predict_simple_regression(
                simple_regression(hydrodynamics_simple_train_df.drop(columns=['y']), hydrodynamics_simple_train_df['y']),
                hydrodynamics_simple_test_df.drop(columns=['y']))
    
    print("Degree =", i, ", r2 = ", sklearn.metrics.r2_score(real_y, pred_y))

Degree = 2 , r2 =  0.9195476583157262


### Задание 2

In [8]:
def ridge_regression(X, y, alpha=0.001):
    X = np.array(X.values)
    y = np.array(y.values)
    
    w = np.linalg.inv(X.T.dot(X) - np.eye(X.T.shape[0]) * alpha).dot(X.T).dot(y)
    return w

Тут матрица обратима, отличные от 0 значени $\alpha$ только уменьшают R2.

In [27]:
noisysine_ridge_df = noisysine_df.copy()

for i in range(2, 6):
    noisysine_ridge_df['x' + str(i)] = noisysine_ridge_df['x'] ** i
    noisysine_ridge_train_df = noisysine_ridge_df.loc[noisysine_train.index]
    noisysine_ridge_test_df = noisysine_ridge_df.loc[noisysine_test.index]
    
    real_y = noisysine_ridge_test_df.sort_values(by=['x'])['y']
    pred_y = predict_simple_regression(
                ridge_regression(noisysine_ridge_train_df.drop(columns=['y']), noisysine_ridge_train_df['y'], 0.000001),
                noisysine_ridge_test_df.drop(columns=['y']).sort_values(by=['x']))
    
    print("Degree =", i, ", r2 = ", sklearn.metrics.r2_score(real_y, pred_y))
    
    plotly.offline.iplot([
        go.Scatter(
            y = real_y,
            x = noisysine_ridge_test_df.sort_values(by=['x'])['x'],
            mode = 'markers+lines',
            name = 'real values'
        ),
        go.Scatter(
            y = pred_y,
            x = sorted(noisysine_ridge_test_df['x']),
            mode = 'markers+lines',
            name = 'pred values'
        ),
        ], filename='real_values.html')

Degree = 2 , r2 =  0.0898094157071262


Degree = 3 , r2 =  0.09160806354120388


Degree = 4 , r2 =  0.12814303895441537


Degree = 5 , r2 =  0.9058064710499513


А вот тут можно выжать ещё доли процента.

In [29]:
hydrodynamics_ridge_df = hydrodynamics_df.copy()

for i in [2]:
    columns = hydrodynamics_ridge_df.drop(columns=['y']).columns
    
    for c1 in columns:
        for c2 in columns:
            hydrodynamics_ridge_df[c1 + c2] = hydrodynamics_ridge_df[c1] * hydrodynamics_ridge_df[c2]
            
    hydrodynamics_ridge_train_df = hydrodynamics_ridge_df.loc[hydrodynamics_train.index]
    hydrodynamics_ridge_test_df = hydrodynamics_ridge_df.loc[hydrodynamics_test.index]
    
    real_y = hydrodynamics_ridge_test_df['y']
    pred_y = predict_simple_regression(
                ridge_regression(hydrodynamics_ridge_train_df.drop(columns=['y']), hydrodynamics_ridge_train_df['y'], 0.000025),
                hydrodynamics_ridge_test_df.drop(columns=['y']))
    
    print("Degree =", i, ", r2 = ", sklearn.metrics.r2_score(real_y, pred_y))

Degree = 2 , r2 =  0.9211620395493217


### Задание 3

In [11]:
noisysine_lasso_df = noisysine_df.copy()

for i in range(2, 6):
    noisysine_lasso_df['x' + str(i)] = noisysine_lasso_df['x'] ** i
    noisysine_lasso_train_df = noisysine_lasso_df.loc[noisysine_train.index]
    noisysine_lasso_test_df = noisysine_lasso_df.loc[noisysine_test.index]
    
    model = sklearn.linear_model.Lasso(alpha=0.002, max_iter=10000000, selection='cyclic')
    model.fit(noisysine_lasso_train_df.drop(columns=['y']), noisysine_lasso_train_df['y'])
    
    real_y = noisysine_lasso_test_df.sort_values(by=['x'])['y']
    pred_y = model.predict(noisysine_lasso_test_df.drop(columns=['y']).sort_values(by=['x']))
    
    print("Degree =", i, ", r2 = ", sklearn.metrics.r2_score(real_y, pred_y),
          "features used =", sum(map(lambda x: 1 if x > 0 else 0, model.coef_)))
    
    plotly.offline.iplot([
        go.Scatter(
            y = real_y,
            x = noisysine_lasso_test_df.sort_values(by=['x'])['x'],
            mode = 'markers+lines',
            name = 'real values'
        ),
        go.Scatter(
            y = pred_y,
            x = sorted(noisysine_lasso_test_df['x']),
            mode = 'markers+lines',
            name = 'pred values'
        ),
        ], filename='real_values.html')

Degree = 2 , r2 =  0.2562725537816791 features used = 1


Degree = 3 , r2 =  0.41764868365808305 features used = 1


Degree = 4 , r2 =  0.3890777715357262 features used = 2


Degree = 5 , r2 =  0.8688702521562265 features used = 3


In [12]:
hydrodynamics_lasso_df = hydrodynamics_df.copy()

for i in [2]:
    columns = hydrodynamics_lasso_df.drop(columns=['y']).columns
    
    for c1 in columns:
        for c2 in columns:
            hydrodynamics_lasso_df[c1 + c2] = hydrodynamics_lasso_df[c1] * hydrodynamics_lasso_df[c2]
            
    hydrodynamics_lasso_train_df = hydrodynamics_lasso_df.loc[hydrodynamics_train.index]
    hydrodynamics_lasso_test_df = hydrodynamics_lasso_df.loc[hydrodynamics_test.index]
    
    model = sklearn.linear_model.Lasso(alpha=0.0072, max_iter=100000, selection='cyclic')
    model.fit(hydrodynamics_lasso_train_df.drop(columns=['y']), hydrodynamics_lasso_train_df['y'])        
    
    real_y = hydrodynamics_lasso_test_df['y']
    pred_y = model.predict(hydrodynamics_lasso_test_df.drop(columns=['y']))

    
    print("Degree =", i, ", r2 = ", sklearn.metrics.r2_score(real_y, pred_y), 
          "features used =", sum(map(lambda x: 1 if x > 0 else 0, model.coef_)))

Degree = 2 , r2 =  0.8849071931241844 features used = 9


### Задание 4

In [13]:
import random

In [37]:
tsp_df = pd.read_csv('tsp.csv', sep=',', header=0)

In [15]:
def draw_path(sequence, df):
    xs = []
    ys = []
    for j in sequence:
        cur_x, cur_y = df.iloc[j]['x'], df.iloc[j]['y']
        xs.append(cur_x)
        xs.append(cur_x)
        if ys:
            ys.append(cur_y)
        ys.append(cur_y)
        
    plotly.offline.iplot([
        go.Scatter(
            y = ys,
            x = xs[:-1],
            mode = 'markers+lines',
            name = 'path'
        )
        ], filename='real_values.html')

In [16]:
sequence = list(range(len(tsp_df.index)))
mk_best_path = None
mk_best_length = None

for i in range(1000):
    random.shuffle(sequence)
    cur_length = 0
    last_x, last_y = None, None
    for j in sequence:
        cur_x, cur_y = tsp_df.iloc[j]['x'], tsp_df.iloc[j]['y']
        if (last_x is not None and last_y is not None):
            cur_length += abs(last_x - cur_x) + abs(last_y - cur_y)
        last_x, last_y = cur_x, cur_y
    if (mk_best_length is None or mk_best_length > cur_length):
        mk_best_length = cur_length
        mk_best_path = list(sequence)

In [17]:
assert mk_best_path is not None
print("Best length:", mk_best_length)
draw_path(mk_best_path, tsp_df)

Best length: 31240


### Задание 5

In [34]:
rw_best_path = None
rw_best_length = None

def dist_to(x, y, j):
    return abs(tsp_df.iloc[j]['x'] - x) + abs(tsp_df.iloc[j]['y'] - y)

def choices(points, weights):
    r = random.random()
    s = sum(weights) * r
    cumsm = 0
    for i in range(len(points)):
        cumsm += weights[i]
        if cumsm >= s:
            return points[i]

for i in range(1000):
    sz = len(tsp_df.index)
    not_visited = set(range(sz))
    cur_length = 0
    last_x, last_y = None, None
    sequence = []
    for _ in range(sz):
        vs = list(not_visited)
        j = None
        if (last_x is not None and last_y is not None):
            j = choices(vs, weights=list(map(lambda x: 1.0 / dist_to(last_x, last_y, x), vs)))
        else:
            j = random.choice(vs)    
        not_visited.remove(j)
        sequence.append(j)
        cur_x, cur_y = tsp_df.iloc[j]['x'], tsp_df.iloc[j]['y']
        if (last_x is not None and last_y is not None):
            cur_length += abs(last_x - cur_x) + abs(last_y - cur_y)
        last_x, last_y = cur_x, cur_y
        
    if (rw_best_length is None or rw_best_length > cur_length):
        rw_best_length = cur_length
        rw_best_path = sequence

In [35]:
assert rw_best_path is not None
print("Best length:", rw_best_length)
draw_path(rw_best_path, tsp_df)

Best length: 20960


### Задание 6

In [32]:
hc_best_path = None
hc_best_length = None

def dist_to(x, y, j):
    return abs(tsp_df.iloc[j]['x'] - x) + abs(tsp_df.iloc[j]['y'] - y)

for i in range(len(tsp_df.index)):
    not_visited = set(range(sz))
    not_visited.remove(i)
    sequence = [i]
    
    cur_length = 0
    last_x, last_y = tsp_df.iloc[i]['x'], tsp_df.iloc[i]['y']
    for _ in range(sz - 1):
        new_x, new_y = None, None
        new_best = None
        new_j = None
        for j in not_visited:
            if new_best is None or dist_to(last_x, last_y, j) < new_best:
                new_best = dist_to(last_x, last_y, j)
                new_x, new_y = tsp_df.iloc[j]['x'], tsp_df.iloc[j]['y']
                new_j = j
        not_visited.remove(new_j)
        sequence.append(new_j)
        
        cur_length += new_best         
        last_x, last_y = new_x, new_y
        
    if (hc_best_length is None or hc_best_length > cur_length):
        hc_best_length = cur_length
        hc_best_path = list(sequence)

In [33]:
assert hc_best_path is not None
print("Best length:", hc_best_length)
draw_path(hc_best_path, tsp_df)

Best length: 9680


### Задание 7

In [51]:
from scipy.special import expit

sequence = random.shuffle(list(range(len(tsp_df.index))))
sa_best_path = None
sa_best_length = None

def seq_len(sequence):
    cur_length = 0
    last_x, last_y = None, None
    for j in sequence:
        cur_x, cur_y = tsp_df.iloc[j]['x'], tsp_df.iloc[j]['y']
        if (last_x is not None and last_y is not None):
            cur_length += abs(last_x - cur_x) + abs(last_y - cur_y)
        last_x, last_y = cur_x, cur_y
    return cur_length

sz = len(tsp_df.index)

for _ in range(100):
    sequence = list(range(sz))
    random.shuffle(sequence)
    for i in range(50):
        cur_length = seq_len(sequence)
        r1 = random.randint(0, sz - 1)
        r2 = random.randint(r1 + 1, sz)
        new_seq = sequence[:r1] + list(reversed(sequence[r1:r2])) + sequence[r2:]
        new_length = seq_len(new_seq)
        
        if (expit(new_length - cur_length) / (math.sqrt(i + 1))  > random.random()):
            sequence = new_seq
    cur_length = seq_len(sequence)
    
    if (sa_best_length is None or sa_best_length > cur_length):
        sa_best_length = cur_length
        sa_best_path = sequence

In [52]:
assert sa_best_path is not None
print("Best length:", sa_best_length)
draw_path(sa_best_path, tsp_df)

Best length: 34480
