### **Instalação do surprise**

In [1]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 232kB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1617631 sha256=0ae11cc9917c7550bb5369557181a4b42784ab057e4a9b6e2e44b77ddceec911
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


### **Importações**

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD

### **Carregamento do dataset**

In [28]:
dataset = pd.read_csv('/content/data.csv', header=None, names = ['user_id','question','rating'])
df = dataset.head(100000)
print(df.head(4))
print(df.shape)
df.isnull().sum()

   user_id    question  rating
0   276726   155061224       5
1   276729  052165615X       3
2   276729   521795028       6
3   276736  3257224281       8
(100000, 3)


user_id     0
question    0
rating      0
dtype: int64

### **Recomendação baseada em popularidade**

In [29]:
new_df=df.groupby("question").filter(lambda x:x['rating'].count() >=50)
print(new_df.head())
ratings_df = pd.DataFrame(new_df.groupby('question').rating.mean())
ratings_df['rating_counts'] = new_df.groupby('question').rating.count()
ratings_df.sort_values(by='rating_counts', ascending=False).head(5)

     user_id    question  rating
210   276925   385504209       8
284   276953   446310786      10
379   277042   971880107       2
573   277212  044023722X       8
594   277246   452282152      10


Unnamed: 0_level_0,rating,rating_counts
question,Unnamed: 1_level_1,Unnamed: 2_level_1
316666343,8.17284,162
971880107,4.484375,128
385504209,8.747664,107
60928336,7.678161,87
312195516,8.523256,86


In [30]:
#avaliação média
C = ratings_df['rating'].mean()
print(C)

#minimo de avaliações
m = ratings_df.rating_counts.min(); m
print(m)

#v = n avaliações de uma questao
#R = avaliação média da questao

def weighted_rating(x, m=m, C=C):
    v = x['rating_counts']
    R = x['rating']
    return (v/(v+m) * R) + (m/(m+v) * C)

7.881765932988788
50


In [31]:
ratings_df['score'] = ratings_df.apply(weighted_rating, axis=1)
ratings_df.sort_values(by='score', ascending=False).head(15)

Unnamed: 0_level_0,rating,rating_counts,score
question,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
059035342X,9.048387,62,8.527574
385504209,8.747664,107,8.4719
446310786,8.949153,59,8.459526
142001740,8.828125,64,8.413055
312195516,8.523256,86,8.287414
679781587,8.446154,65,8.200768
316666343,8.17284,162,8.10419
671027360,8.169492,59,8.037507
743418174,8.157895,57,8.028863
446672211,8.125,72,8.025314


### **Filtragem colaborativa item-item**

In [32]:
reader = Reader()
data = Dataset.load_from_df(new_df,reader)

trainset, testset = train_test_split(data, test_size=0.2,random_state=10)
algo = KNNWithMeans(k=5, sim_options={'user_based': False})
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f761adcd550>

In [35]:
algo.get_neighbors(3, 15)

[11, 13, 12, 14, 18, 20, 5, 6, 19, 4, 2, 8, 10, 17, 23]

In [34]:
ratings_df.iloc[[14, 20, 13, 15, 11, 18, 21, 17, 10, 6, 5, 23, 4, 16, 3]].index

Index(['446605239', '679781587', '446310786', '446672211', '385504209',
       '671003755', '743418174', '60928336', '375727345', '316601950',
       '312195516', '971880107', '142001740', '452282152', '067976402X'],
      dtype='object', name='question')

### **Sistema de filtragem colaborativa baseado em modelo**

In [36]:
new_df1=new_df.head(20000)
ratings_matrix = new_df1.pivot_table(values='rating', index='user_id', columns='question', fill_value=0)
ratings_matrix.head()

question,044021145X,044023722X,059035342X,067976402X,142001740,312195516,316601950,316666343,345370775,345417623,375727345,385504209,440226430,446310786,446605239,446672211,452282152,60928336,671003755,671027360,679781587,743418174,786868716,971880107
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
26,0,0,0,0,0,0,0,0,0,0,0,0.0,0,10,0,0,0,0,0,0,0,0,0,0
114,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,10,0,0,0,0
243,0,7,0,0,0,0,9,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0
254,0,0,0,0,9,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,8,0,0,0
256,0,8,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0


In [37]:
print(ratings_matrix.shape)
X = ratings_matrix.T
X.head()

(1274, 24)


user_id,26,114,243,254,256,507,595,638,709,763,805,882,899,900,936,1008,1025,1031,1032,1075,1131,1211,1249,1254,1261,1376,1433,1435,1486,1504,1530,1535,1548,1652,1718,1733,1768,1803,1805,1848,...,58389,58515,276925,276953,277042,277212,277246,277358,277378,277427,277439,277462,277517,277535,277743,277752,277803,277873,277901,277903,277938,277958,277965,277997,278007,278075,278107,278122,278162,278176,278220,278254,278350,278390,278422,278541,278543,278633,278843,278844
question,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
044021145X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
044023722X,0.0,0.0,7.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
059035342X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0
067976402X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
142001740,0.0,0.0,0.0,9.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0


In [38]:
SVD_model = TruncatedSVD(n_components=10)
decomposed_matrix = SVD_model.fit_transform(X)
decomposed_matrix.shape

(24, 10)

### **Matriz de correlação: semelhança entre questões (0-1)**

In [39]:
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

(24, 24)

In [40]:
correlation_matrix

array([[ 1.00000000e+00,  7.28757445e-01,  2.36139297e-01,
         5.67297833e-01,  1.02266136e-01,  9.32476081e-02,
         8.55712811e-01,  1.49801844e-01,  6.21444758e-01,
         7.87051808e-01,  5.80591281e-01, -8.98471658e-02,
         5.25834902e-01,  3.52635569e-01,  4.36820577e-01,
        -1.85327146e-01,  4.01804574e-01,  6.39957337e-01,
         3.64192062e-01,  1.87170951e-01,  2.99178529e-01,
         2.88552512e-01,  4.45555306e-01,  2.09142628e-01],
       [ 7.28757445e-01,  1.00000000e+00, -1.60662286e-01,
         1.76371366e-01,  5.06820511e-02, -5.69611650e-02,
         7.44310246e-01,  1.40315150e-02,  6.91781619e-01,
         9.04234497e-01,  5.11292059e-01, -2.18268400e-01,
         5.49344066e-01,  3.77403299e-02,  3.23106810e-01,
        -2.10332324e-01,  4.79515908e-01,  2.70452376e-01,
        -3.14748655e-02,  2.95859100e-01, -1.74689035e-01,
         3.05382772e-01,  5.29593751e-01, -3.36550664e-02],
       [ 2.36139297e-01, -1.60662286e-01,  1.00000000e

In [41]:
i = "044023722X"

questions_name = list(X.index)
question_ID = questions_name.index(i)
question_ID

1

In [42]:
correlation_question_ID = correlation_matrix[question_ID]
correlation_question_ID.shape

(24,)

In [43]:
Recommend = list(X.index[correlation_question_ID > 0.50])
Recommend.remove(i) 
Recommend[0:10]

['044021145X',
 '316601950',
 '345370775',
 '345417623',
 '375727345',
 '440226430',
 '786868716']