## collecting data

In this notebook I will build a data set about movies from the filmweb.pl database using BeautifulSoup and try to predict the movies' ratings.

In [None]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

In [None]:
#function getting information about 1 film

def get_info(link):
    """Scrape a single filmweb.pl movie page into a flat dictionary.

    Parameters
    ----------
    link : str
        Absolute URL of the movie page.

    Returns
    -------
    dict
        Maps the field labels found on the page to their text (only the part
        before the first '(' is kept), plus 'rating' (raw text) and, when a
        cast page could be parsed, 'title' and 'actors' (list of names).

    Raises
    ------
    AttributeError, IndexError
        When the main info box, the "other info" section or the rating
        element is missing, or the info box has fewer than five entries.
        The scraping loop that calls this function skips such movies.
    """
    base_link='https://www.filmweb.pl'

    r=requests.get(link)
    soup=bs(r.content)

    movie_info={}

    #info about directors,...,premiere
    info_box=soup.find(class_='filmPosterSection__info filmInfo')

    keys=info_box.find_all(class_='filmInfo__header')
    values=info_box.find_all(class_='filmInfo__info')

    for i in range(5):
        key=keys[i].get_text(strip=True).split('(')[0]
        value=values[i].get_text(strip=True).split('(')[0]
        movie_info[key]=value

    #info about title and actors
    #the "more" buttons are looked up once; each of the first 10 is tried as a cast page
    more_buttons=soup.find_all(class_='page__moreButton')
    for button in more_buttons[:10]:
        try:
            actors_link=base_link+button.a['href']

            r2=requests.get(actors_link)
            soup2=bs(r2.content)

            movie_info['title']=soup2.find(class_='filmHeaderSection__title').get_text(strip=True)

            actors_table=soup2.find(class_='filmFullCastSection__list')
            actors_rows=actors_table.find_all(class_='castRoleListElement__info')
            movie_info['actors']=[]

            for index, row in enumerate(actors_rows):
                actor_name=row.a.get_text(strip=True)
                movie_info['actors'].append(actor_name)
                #the check runs after the append, so at most 12 names are kept
                if index>10:
                    break
        except Exception:
            #this button does not lead to a parsable cast page: try the next one.
            #Exception (not a bare except) so that Ctrl-C still stops the scrape.
            continue

    #info about boxoffice,..., original title
    info_box3=soup.find(class_='filmOtherInfoSection__group')
    info_rows3=info_box3.find_all(class_='filmInfo__group')

    #only the first 5 groups are read, and at most 2 header/info pairs per group
    for row in info_rows3[:5]:
        row_keys=row.find_all(class_='filmInfo__header')
        row_values=row.find_all(class_='filmInfo__info')
        for key_tag, value_tag in zip(row_keys[:2], row_values[:2]):
            key2=key_tag.get_text(strip=True).split('(')[0]
            value2=value_tag.get_text(strip=True).split('(')[0]
            movie_info[key2]=value2

    #info about rating
    movie_info['rating']=soup.find(class_='filmRating__rateValue').get_text(strip=True)

    return movie_info

We will use search links of the form https://www.filmweb.pl/films/search?endRate=9&orderBy=rate&descending=true&startCount=8000&startRate=8 and collect information about movies with ratings from 3.9 to 8.9 stars.

In [None]:
movies_info_list=[]
base_link='https://www.filmweb.pl'
k=10 #number of result pages fetched for every rating band

#one search per rating band [j, j+1] for j = 3..8
for l in range(6):
    j=3+l

    for i in range(k):
        link='https://www.filmweb.pl/films/search?endRate={}'.format(j+1)+'&orderBy=rate&descending=true&startCount=8000'+'&startRate={}'.format(j)+'&page={}'.format(i+1)

        r=requests.get(link)
        soup=bs(r.content)
        film_list=soup.select('.wrapperContent.page__section')
        film_rows=film_list[0].find_all('li')

        #only the first 80 list items of a result page are inspected
        for row in film_rows[:80]:
            try:
                movie_link=base_link+row.find(class_='filmPreview__titleDetails').a['href']
                movies_info_list.append(get_info(movie_link))
            except Exception:
                #list item is not a movie preview, or its page could not be parsed: skip it.
                #Exception (not a bare except) so that Ctrl-C still stops the scrape.
                continue

In [None]:
#creating a DataFrame: one row per scraped movie, one column per key found in the movie dicts
df=pd.DataFrame(movies_info_list)

In [None]:
#listing the scraped column names (they are the field labels found on the pages)
df.columns

Removing unnecessary columns

In [None]:
#Some labels were scraped in two encodings, so both spellings are listed.
#errors='ignore' keeps the cell working when a given scrape did not produce every variant.
df=df.drop(columns=['budÅ¼et','dystrybucja','inne tytuÅ‚y','inne tytuły','na podstawie','reÅ¼yseria','tytuÅ‚ oryg.','tytuł oryg.'],
           errors='ignore')

In [None]:
#translating the Polish column names into English
english_names={
    'budżet':'budget',
    'data produkcji':'date of production',
    'gatunek':'genre',
    'premiera':'premiere',
    'produkcja':'production',
    'reżyseria':'directing',
    'scenariusz':'scenario',
}
df=df.rename(columns=english_names)

In [None]:
#displaying the frame after dropping and renaming columns
df

In [None]:
#checking NaN values: the heatmap is drawn from the boolean mask of missing cells
sns.heatmap(df.isnull())

There are a lot of NaN values in the budget and boxoffice columns, so we will create one additional DataFrame without these columns. We will also remove rows with NaN values in studio and directing.

In [None]:
#df1 will be a DataFrame without budget and boxoffice columns
#(copied here so that later changes to df1 do not touch df)
df1=df.copy()

In [None]:
#df1 keeps every column except the two sparse money columns
df1=df1.drop(columns=['budget','boxoffice'])

In [None]:
#df needs all four columns filled; df1 has no money columns, so only two are required
df=df.dropna(subset=['directing','studio','budget','boxoffice'])
df1=df1.dropna(subset=['directing','studio'])

In [None]:
#renumbering the rows 0..n-1 after filtering; reset_index() keeps the old labels
#in a new 'index' column, which the next cell drops
df=df.reset_index()
df1=df1.reset_index()

In [None]:
#removing the 'index' column left behind by reset_index()
df=df.drop(columns='index')
df1=df1.drop(columns='index')

Now we will transform the values in some columns. In the 'boxoffice' column we will keep only the first number, which is the worldwide box office, and convert it to a number; in the 'budget' column we will remove the dollar sign and convert the value to a float; the values in the 'premiere' column will become datetime objects; in the 'production', 'directing', 'scenario' and 'studio' columns we will have lists of strings instead of a single string; and we will convert the rating values to float. Let's define functions to do that.

In [None]:
#functions to clean data
def clean_boxoffice(value):
    """Extract the first dollar amount from a money string as a string of digits.

    Keeps the text between the first '$' and the first following 'na' or 'w',
    then removes all whitespace, e.g. '$1 234 567na swiecie' -> '1234567'.

    Raises IndexError when the text before the first 'na' contains no '$'.
    """
    amount=value.split('na')[0].split('$')[1]
    #str.split never raises, so the former one-iteration try/except loop was dead code
    amount=amount.split('w')[0]
    return ''.join(amount.split())

def clean_scenario(value):
    """Split a '/'-separated string into a list, dropping a trailing "more..." marker.

    The marker is the Polish link text 'więcej...' (either capitalisation) and is
    only removed when it is the last element.
    """
    parts=value.split('/')
    if parts[-1] in ('więcej...','Więcej...'):
        parts.pop()
    return parts

def clean_prod(value):
    """Split a ', '-separated string into a list of its items."""
    items=value.split(', ')
    return items

def clean_studio(value):
    """Join the strings in `value` with spaces and return the whitespace-separated words.

    A final 'Więcej...' word (the "more..." link text) is dropped.
    Raises IndexError when the joined text contains no words.
    """
    words=' '.join(value).split()
    if words[-1]=='Więcej...':
        return words[:-1]
    return words

def miesiac_ang(miesiac):
    """Translate a Polish month name (genitive or nominative form) into English.

    Returns None when the word is not one of the recognised month names.
    """
    english_by_polish={
        'stycznia':'January','styczeń':'January',
        'lutego':'February','luty':'February',
        'marca':'March','marzec':'March',
        'kwietnia':'April','kwiecień':'April',
        'maja':'May','maj':'May',
        'czerwca':'June','czerwiec':'June',
        'lipca':'July','lipiec':'July',
        'sierpnia':'August','sierpień':'August',
        'września':'September','wrzesień':'September',
        'października':'October','październik':'October',
        'listopada':'November','listopad':'November',
        'grudnia':'December','grudzień':'December',
    }
    return english_by_polish.get(miesiac)
    
def translating_date(date):
    """Rewrite a Polish date string so that its month name is in English.

    A three-word date ('day month year') has its second word translated.
    Any other date has its first word translated and gets '1 ' prepended
    as the day.
    """
    parts=date.split()
    has_day=len(parts)==3
    month_pos=1 if has_day else 0
    parts[month_pos]=miesiac_ang(parts[month_pos])
    translated=' '.join(parts)
    if has_day:
        return translated
    return '1 '+translated

def transform_rating(value):
    """Replace every decimal comma with a dot so the text can be passed to float()."""
    return value.replace(',','.')

Transforming df

In [None]:
#cleaning 'boxoffice' and 'budget' column
df['boxoffice']=df['boxoffice'].apply(clean_boxoffice)
df['boxoffice']=df['boxoffice'].apply(int)
df['budget']=df['budget'].apply(clean_boxoffice)
df['budget']=df['budget'].apply(float)
df['rating']=df['rating'].apply(transform_rating)
df['date of production']=df['date of production'].apply(float)
df['rating']=df['rating'].apply(float)


#cleaning 'scenario', 'directing', 'production' and 'studio' column
df['scenario']=df['scenario'].apply(clean_scenario)
df['directing']=df['directing'].apply(clean_scenario)
df['production']=df['production'].apply(clean_scenario)
df['studio']=df['studio'].apply(clean_scenario)

#changing 'premiere' column into datetime objects.
#The whole column is converted in one assignment: the former per-row
#df['premiere'][i]=... is chained indexing, which pandas may apply to a temporary copy.
df['premiere']=df['premiere'].apply(
    lambda date: datetime.strptime(translating_date(date),'%d %B %Y'))

Transforming df1

In [None]:
df1['rating']=df1['rating'].apply(transform_rating)
df1['date of production']=df1['date of production'].apply(float)
df1['rating']=df1['rating'].apply(float)

#cleaning 'scenario', 'production' ,'studio' and 'directing' column
df1['scenario']=df1['scenario'].apply(clean_scenario)
df1['directing']=df1['directing'].apply(clean_scenario)
df1['production']=df1['production'].apply(clean_scenario)
df1['studio']=df1['studio'].apply(clean_scenario)

#changing 'premiere' column into datetime objects.
#The whole column is converted in one assignment: the former per-row
#df1['premiere'][i]=... is chained indexing, which pandas may apply to a temporary copy.
df1['premiere']=df1['premiere'].apply(
    lambda date: datetime.strptime(translating_date(date),'%d %B %Y'))

Now we will create additional columns: 'num of famous actors and actresses', 'num of famous directors' and 'num of famous screenwriters' representing number of famous actors etc. In order to do that we will create at first lists of famous actors, directors and screenwriters, that will be persons with the highest ratings at filmweb.pl.

In [None]:
#displaying df after the type conversions
df

In [None]:
def get_famous_actors(link):
    """Download one ranking page and add its person headers to famous_actors_list.

    The text of every 'rankingTypePerson__header' element inside the ranking
    container is appended to the global list; nothing is returned.
    """
    page=bs(requests.get(link).content)
    ranking=page.find(class_='page__container rankingTypeSection__container')
    headers=ranking.find_all(class_='rankingTypePerson__header')
    famous_actors_list.extend(header.get_text() for header in headers)
    


In [None]:
def get_famous_actress(link):
    """Download one ranking page and add its person headers to famous_actress_list.

    The text of every 'rankingTypePerson__header' element inside the ranking
    container is appended to the global list; nothing is returned.
    """
    page=bs(requests.get(link).content)
    ranking=page.find(class_='page__container rankingTypeSection__container')
    headers=ranking.find_all(class_='rankingTypePerson__header')
    famous_actress_list.extend(header.get_text() for header in headers)
    


In [None]:
def get_famous_directors(link):
    """Download one ranking page and add its person headers to famous_directors_list.

    The text of every 'rankingTypePerson__header' element inside the ranking
    container is appended to the global list; nothing is returned.
    """
    page=bs(requests.get(link).content)
    ranking=page.find(class_='page__container rankingTypeSection__container')
    headers=ranking.find_all(class_='rankingTypePerson__header')
    famous_directors_list.extend(header.get_text() for header in headers)
    


In [None]:
def get_famous_screenwriters(link):
    """Download one ranking page and add its person headers to famous_screenwriters_list.

    The text of every 'rankingTypePerson__header' element inside the ranking
    container is appended to the global list; nothing is returned.
    """
    page=bs(requests.get(link).content)
    ranking=page.find(class_='page__container rankingTypeSection__container')
    headers=ranking.find_all(class_='rankingTypePerson__header')
    famous_screenwriters_list.extend(header.get_text() for header in headers)
    


In [None]:
#creating list of famous actors
famous_actors_list=[]
base_link='https://www.filmweb.pl'
#pagination entries without a usable link raise one of these
missing_link_errors=(AttributeError,IndexError,KeyError,TypeError)

link='https://www.filmweb.pl/ranking/person/actors/male'
r=requests.get(link)
soup=bs(r.content)
pages=soup.find(class_='pagination__list').find_all('li')

#first five pagination entries of the first ranking page
for i in range(5):
    try:
        link=base_link+pages[i].a['href']
    except missing_link_errors:
        #entry has no href (presumably the current page - TODO confirm):
        #keep the previously used link and scrape that page
        pass
    get_famous_actors(link)

#later pages are reached through the pagination bars of pages 8, 13 and 18
for j in range(3):
    page_link='https://www.filmweb.pl/ranking/person/actors/male?page={}'.format(8+5*j)
    r=requests.get(page_link)
    soup=bs(r.content)
    pages=soup.find(class_='pagination__list').find_all('li')

    for i in range(5):
        try:
            link=base_link+pages[i+2].a['href']
        except missing_link_errors:
            #entry has no href: scrape the page whose pagination bar is being read
            link=page_link

        get_famous_actors(link)

In [None]:
#creating list of famous actresses
famous_actress_list=[]
base_link='https://www.filmweb.pl'
#pagination entries without a usable link raise one of these
missing_link_errors=(AttributeError,IndexError,KeyError,TypeError)

link='https://www.filmweb.pl/ranking/person/actors/female'
r=requests.get(link)
soup=bs(r.content)
pages=soup.find(class_='pagination__list').find_all('li')

#first five pagination entries of the first ranking page
for i in range(5):
    try:
        link=base_link+pages[i].a['href']
    except missing_link_errors:
        #entry has no href (presumably the current page - TODO confirm):
        #keep the previously used link and scrape that page
        pass
    print(link)
    get_famous_actress(link)

#later pages are reached through the pagination bars of pages 8 and 13
for j in range(2):
    page_link='https://www.filmweb.pl/ranking/person/actors/female?page={}'.format(8+5*j)
    r=requests.get(page_link)
    soup=bs(r.content)
    pages=soup.find(class_='pagination__list').find_all('li')

    for i in range(5):
        try:
            link=base_link+pages[i+2].a['href']
        except missing_link_errors:
            #entry has no href: scrape the page whose pagination bar is being read
            link=page_link
        print(link)

        get_famous_actress(link)

In [None]:
#creating list of famous directors
famous_directors_list=[]
base_link='https://www.filmweb.pl'
#pagination entries without a usable link raise one of these
missing_link_errors=(AttributeError,IndexError,KeyError,TypeError)

link='https://www.filmweb.pl/ranking/person/director'
r=requests.get(link)
soup=bs(r.content)
pages=soup.find(class_='pagination__list').find_all('li')

#first five pagination entries of the first ranking page
for i in range(5):
    try:
        link=base_link+pages[i].a['href']
    except missing_link_errors:
        #entry has no href (presumably the current page - TODO confirm):
        #keep the previously used link and scrape that page
        pass
    print(link)
    get_famous_directors(link)


#two more pages are reached through the pagination bar of page 5
page_link='https://www.filmweb.pl/ranking/person/director?page=5'
r=requests.get(page_link)
soup=bs(r.content)
pages=soup.find(class_='pagination__list').find_all('li')

for i in range(2):
    try:
        link=base_link+pages[i+5].a['href']
    except missing_link_errors:
        #entry has no href: scrape page 5 itself
        link=page_link
    print(link)

    get_famous_directors(link)

In [None]:
#creating list of famous screenwriters
famous_screenwriters_list=[]
base_link='https://www.filmweb.pl'
#pagination entries without a usable link raise one of these
missing_link_errors=(AttributeError,IndexError,KeyError,TypeError)

link='https://www.filmweb.pl/ranking/person/screenwriter'
r=requests.get(link)
soup=bs(r.content)
pages=soup.find(class_='pagination__list').find_all('li')

#first five pagination entries of the first ranking page
for i in range(5):
    try:
        link=base_link+pages[i].a['href']
    except missing_link_errors:
        #entry has no href (presumably the current page - TODO confirm):
        #keep the previously used link and scrape that page
        pass
    print(link)
    get_famous_screenwriters(link)


#later pages are reached through the pagination bars of pages 8, 13 and 18
for j in range(3):
    page_link='https://www.filmweb.pl/ranking/person/screenwriter?page={}'.format(8+5*j)
    r=requests.get(page_link)
    soup=bs(r.content)
    pages=soup.find(class_='pagination__list').find_all('li')

    for i in range(5):
        try:
            link=base_link+pages[i+2].a['href']
        except missing_link_errors:
            #entry has no href: scrape the page whose pagination bar is being read
            link=page_link
        print(link)

        get_famous_screenwriters(link)

In [None]:
#one combined list of famous actors and actresses; its length is displayed as a sanity check
famous_actors_actress_list=famous_actors_list + famous_actress_list
len(famous_actors_actress_list)

Creating the columns.

In [None]:
#the three counters start at 0 for every movie
df['num of famous actors and actresses']=0
df['num of famous directors']=0
df['num of famous screenwriters']=0

In [None]:
#(source column, counter column, famous names, how many leading names are checked)
famous_specs=[
    ('actors','num of famous actors and actresses',set(famous_actors_actress_list),10),
    ('directing','num of famous directors',set(famous_directors_list),3),
    ('scenario','num of famous screenwriters',set(famous_screenwriters_list),3),
]

#Each counter column is computed in a single assignment. The former per-cell
#df[col][i]+=1 is chained indexing (pandas may apply it to a temporary copy)
#and counted twice when the cell was re-run.
for source_col,count_col,famous_names,limit in famous_specs:
    df[count_col]=df[source_col].apply(
        #a missing cast (NaN instead of a list) counts as 0
        lambda names: sum(name in famous_names for name in names[:limit])
        if isinstance(names,list) else 0)

In [None]:
#the three counters start at 0 for every movie
df1['num of famous actors and actresses']=0
df1['num of famous directors']=0
df1['num of famous screenwriters']=0

In [None]:
#(source column, counter column, famous names, how many leading names are checked)
famous_specs=[
    ('actors','num of famous actors and actresses',set(famous_actors_actress_list),10),
    ('directing','num of famous directors',set(famous_directors_list),3),
    ('scenario','num of famous screenwriters',set(famous_screenwriters_list),3),
]

#Each counter column is computed in a single assignment. The former per-cell
#df1[col][i]+=1 is chained indexing (pandas may apply it to a temporary copy)
#and counted twice when the cell was re-run.
for source_col,count_col,famous_names,limit in famous_specs:
    df1[count_col]=df1[source_col].apply(
        #a missing cast (NaN instead of a list) counts as 0
        lambda names: sum(name in famous_names for name in names[:limit])
        if isinstance(names,list) else 0)

In [None]:
#displaying df with the new counter columns
df

Now we will create columns for each film genre and for each country where the movie was produced.

In [None]:
#replacing the 'genre' column with one indicator column per genre value
df=pd.get_dummies(df, columns=['genre'])
df1=pd.get_dummies(df1, columns=['genre'])

In [None]:
#unique production countries of df, in order of first appearance
#(only the first 15 countries of a movie are considered)
lst_of_countries=[]
for countries in df['production']:
    if not isinstance(countries,list):
        #skip rows without a country list instead of hiding every error with a bare except
        continue
    for country in countries[:15]:
        if country not in lst_of_countries:
            lst_of_countries.append(country)

In [None]:
#displaying the production countries found in df
lst_of_countries

In [None]:
#one zero-filled 'prod_<country>' column per production country
for country in lst_of_countries:
    df['prod_{}'.format(country)]=0

In [None]:
#Each 'prod_<country>' column is filled in one assignment: how often the country
#occurs among the first 15 production countries of the movie. The former per-cell
#df[col][i]+=1 is chained indexing (pandas may apply it to a temporary copy)
#and counted twice when the cell was re-run.
for country in lst_of_countries:
    df['prod_{}'.format(country)]=df['production'].apply(
        lambda countries: countries[:15].count(country) if isinstance(countries,list) else 0)

In [None]:
#unique production countries of df1, in order of first appearance
#(only the first 15 countries of a movie are considered)
lst_of_countries=[]
for countries in df1['production']:
    if not isinstance(countries,list):
        #skip rows without a country list instead of hiding every error with a bare except
        continue
    for country in countries[:15]:
        if country not in lst_of_countries:
            lst_of_countries.append(country)

In [None]:
#one zero-filled 'prod_<country>' column per production country
for country in lst_of_countries:
    df1['prod_{}'.format(country)]=0

In [None]:
#Each 'prod_<country>' column is filled in one assignment: how often the country
#occurs among the first 15 production countries of the movie. The former per-cell
#df1[col][i]+=1 is chained indexing (pandas may apply it to a temporary copy)
#and counted twice when the cell was re-run.
for country in lst_of_countries:
    df1['prod_{}'.format(country)]=df1['production'].apply(
        lambda countries: countries[:15].count(country) if isinstance(countries,list) else 0)

In [None]:
#displaying df with the genre and country columns
df

Saving our data as csv file.

In [None]:
#the row index is written too; it comes back as the 'Unnamed: 0' column when the files are read again
df.to_csv('filmweb_data')
df1.to_csv('filmweb_data2')

## Exploratory data analysis

In [None]:
#reloading the saved data, so the analysis does not need the scraping cells above
df=pd.read_csv('filmweb_data')
df1=pd.read_csv('filmweb_data2')

In [None]:
#'Unnamed: 0' is the row index that to_csv stored as a column
df=df.drop(columns='Unnamed: 0')
df1=df1.drop(columns='Unnamed: 0')

In [None]:
#looking at one 'production' value as it comes back from the CSV file
df['production'][1]

After loading data from file we have strings instead of lists so we will define function which will transform them into lists again.

In [None]:
#function which transform a string into a list (for 'production' and 'studio' columns)
def string_to_list(x):
    """Turn the text form of a list, e.g. "['USA', 'Polska']", back into a list.

    The text is parsed as a Python literal, so items that contain an
    apostrophe (which are written with double quotes) or a comma are restored
    correctly; the former quote-splitting approach broke such items apart.

    Raises ValueError or SyntaxError when `x` is not a valid Python literal.
    """
    import ast  #local import: only this helper needs it

    return list(ast.literal_eval(x))

In [None]:
#'production' and 'studio' hold lists, which the CSV round trip turned into text
for frame in (df,df1):
    for list_column in ('production','studio'):
        frame[list_column]=frame[list_column].apply(string_to_list)

Let's create a new column 'revenue' and check how correlation matrix looks like.

In [None]:
df['revenue']=df['boxoffice']/df['budget']

#df1 has no budget/boxoffice columns, so its revenue is looked up in df.
#df and df1 were filtered and renumbered separately: row i of df is generally a
#different movie than row i of df1, so the values are matched by title and
#production year instead of by row position. Movies that are not in df get NaN.
movie_key=['title','date of production']
df1=(df1.drop(columns='revenue',errors='ignore')  #lets the cell be re-run
        .merge(df[movie_key+['revenue']].drop_duplicates(movie_key),
               on=movie_key,how='left'))

In [None]:
#correlation matrix of the numeric columns of df
sns.heatmap(df[['date of production','rating','boxoffice','budget','num of famous actors and actresses',
                 'num of famous directors','num of famous screenwriters','revenue']].corr(), cmap='coolwarm')

In [None]:
#the same correlation matrix for df1, which has no boxoffice and budget columns
sns.heatmap(df1[['date of production','rating','num of famous actors and actresses',
                 'num of famous directors','num of famous screenwriters','revenue']].corr(), cmap='coolwarm')

We can see that the number of famous directors and screenwriters has the biggest impact on the rating; on the other hand, the boxoffice and the budget have some of the smallest impacts, so we will not use them for building a model. Now we will check how the mean rating and the revenue of movies depend on different variables.

In [None]:
#unique production countries of df1, in order of first appearance
#(only the first 5 countries of a movie are considered)
lst=[]
for countries in df1['production']:
    if not isinstance(countries,list):
        #skip rows without a country list instead of hiding every error with a bare except
        continue
    for country in countries[:5]:
        if country not in lst:
            lst.append(country)
len(lst)

In [None]:
#One row per country: [country, mean rating of the movies it took part in].
#The frame is built in one go instead of being pre-filled with zeros and overwritten
#row by row inside a bare except. The columns are still named 0 and 1 here.
countries_mean_rating=pd.DataFrame(
    [[country,
      df1.loc[df1['production'].apply(lambda countries: country in countries),'rating'].mean()]
     for country in lst])

In [None]:
#giving the two positional columns readable names
countries_mean_rating=countries_mean_rating.rename(columns={0:'country', 1:'mean rating'})

In [None]:
#creating a column 'num_of_movies' which represents how many movies was produced by a given country
countries_mean_rating['num_of_movies']=0

In [None]:
#Number of movies each country took part in, in the same order as `lst`
#(the order in which the rows of countries_mean_rating were built).
#Assigned as a whole column: the former countries_mean_rating[col][i]=... is
#chained indexing, which pandas may apply to a temporary copy.
countries_mean_rating['num_of_movies']=[
    int(df1['production'].apply(lambda countries: country in countries).sum())
    for country in lst]

In [None]:
#countries ordered from the highest to the lowest mean rating
countries_mean_rating.sort_values(by='mean rating',axis=0, ascending=False)

We see that countries which have the biggest mean rating have also very little number of movies produced, from among 8 countries which have produced the most movies, Italy has the biggest mean rating.

In [None]:
fig,axes = plt.subplots(figsize=(12,3))
#'rating' is selected before averaging: groupby().mean() over the whole frame
#fails on the text and list columns in pandas 2.x
axes.plot(df1.groupby('num of famous actors and actresses')['rating'].mean())
axes.set_ylabel('mean rating')
axes.set_xlabel('num of famous actors and actresses')
plt.tight_layout()

The number of famous actors doesn't seem to have a big impact on the rating; only movies with 8 or 9 famous actors have a significantly higher mean rating.

In [None]:
#mean rating per number of famous directors, highest first.
#'rating' is selected before averaging: groupby().mean() over the whole frame
#fails on the text and list columns in pandas 2.x
df1.groupby('num of famous directors')['rating'].mean().sort_values(ascending=False).to_frame()

In [None]:
#mean rating per number of famous screenwriters, highest first.
#'rating' is selected before averaging: groupby().mean() over the whole frame
#fails on the text and list columns in pandas 2.x
df1.groupby('num of famous screenwriters')['rating'].mean().sort_values(ascending=False).to_frame()

We can see that if we have at least one famous screenwriter or director then a mean rating is much bigger, but there is no difference if we have one or two famous screenwriters or directors.

In [None]:
fig,axes=plt.subplots(figsize=(12,3))
#'rating' is selected before averaging: groupby().mean() over the whole frame
#fails on the text and list columns in pandas 2.x
axes.plot(df1.groupby('date of production')['rating'].mean())
axes.set_xlabel('date of production')
axes.set_ylabel('mean rating')
plt.tight_layout()

Movies produced after around 1980 start to get a lower mean rating, and the biggest drop is between 2019 and 2020. Let's check the number of movies with a rating above 7.5 which were produced before and after 1980.

In [None]:
#movies from 1980 itself are not counted here
produced_after_1980=df1['date of production']>1980
rated_above_7_5=df1['rating']>7.5
high_rated_after=df1[produced_after_1980&rated_above_7_5]['rating'].count()
print('Number of movies with rating above 7.5 produced after 1980: {}'.format(high_rated_after))

In [None]:
#movies from 1980 itself are not counted here
produced_before_1980=df1['date of production']<1980
rated_above_7_5=df1['rating']>7.5
high_rated_before=df1[produced_before_1980&rated_above_7_5]['rating'].count()
print('Number of movies with rating above 7.5 produced before 1980: {}'.format(high_rated_before))

We can see that even though the mean rating of movies produced after 1980 starts to get lower, the number of movies with a rating above 7.5 is almost two times bigger. Now let's see which movies have the biggest revenue and how it depends on the number of famous actors, screenwriters and directors.

In [None]:
#movies ordered from the highest to the lowest revenue (boxoffice divided by budget)
df[['title','revenue']].sort_values(by='revenue',ascending=False)

In [None]:
#mean revenue per number of famous actors and actresses, highest first.
#'revenue' is selected before averaging: groupby().mean() over the whole frame
#fails on the text and list columns in pandas 2.x
df.groupby('num of famous actors and actresses')['revenue'].mean().sort_values(ascending=False).to_frame()

In [None]:
#mean revenue per number of famous directors, highest first.
#'revenue' is selected before averaging: groupby().mean() over the whole frame
#fails on the text and list columns in pandas 2.x
df.groupby('num of famous directors')['revenue'].mean().sort_values(ascending=False).to_frame()

In [None]:
#mean revenue per number of famous screenwriters, highest first.
#'revenue' is selected before averaging: groupby().mean() over the whole frame
#fails on the text and list columns in pandas 2.x
df.groupby('num of famous screenwriters')['revenue'].mean().sort_values(ascending=False).to_frame()

Now we will create some models and try to predict a rating of a movie. In order to check which one is the best we will calculate a root mean square error.

## model 1- Support Vector Regression

In [None]:
#starting again from the saved files: the columns added during the exploratory analysis are not in them
df=pd.read_csv('filmweb_data')
df1=pd.read_csv('filmweb_data2')

In [None]:
#the saved row index plus the text, list and date columns are removed before modelling
non_feature_columns=['Unnamed: 0','actors','premiere','production','directing','scenario','studio','title']
df1=df1.drop(columns=non_feature_columns)
df=df.drop(columns=non_feature_columns)

In [None]:
#features: every remaining column except the target 'rating'
x=df1.drop('rating',axis=1)
y=df1['rating']
from sklearn.model_selection import train_test_split
#a fixed random_state makes the split, and so every RMSE below, reproducible between runs
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [None]:
#Support Vector Regression with default parameters, scored by RMSE on the test set
from sklearn import svm
regr = svm.SVR()
regr.fit(x_train,y_train)
pred=regr.predict(x_test)

from sklearn import metrics
import numpy as np
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred)))

## model 2- KNeighborsRegressor

In [None]:
#k-nearest-neighbours regression with default parameters, scored by RMSE on the test set
from sklearn.neighbors import KNeighborsRegressor
model=KNeighborsRegressor()
model.fit(x_train,y_train)
pred=model.predict(x_test)

print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred)))

## model 3- GaussianProcessRegressor

In [None]:
#Gaussian process regression with default parameters, scored by RMSE on the test set
from sklearn.gaussian_process import GaussianProcessRegressor
model=GaussianProcessRegressor()
model.fit(x_train,y_train)
pred=model.predict(x_test)

print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred)))

## model 4- DecisionTreeRegressor

In [None]:
#decision tree regression with default parameters, scored by RMSE on the test set
from sklearn.tree import DecisionTreeRegressor
model=DecisionTreeRegressor()
model.fit(x_train,y_train)
pred=model.predict(x_test)

print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred)))

We can see that the best one is Support Vector Regression. Let's try to improve it with GridSearch.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
#7 x 7 grid over C and epsilon on a logarithmic scale; refit=True retrains the best
#combination so that `grid` can predict directly
param={'C':[0.001,0.01,0.1,1,10,100,1000],'epsilon':[0.001,0.01,0.1,1,10,100,1000]}
grid = GridSearchCV(svm.SVR(),param,refit=True,verbose=3)

In [None]:
#running the search on the training data only
grid.fit(x_train,y_train)

In [None]:
#parameter combination selected by the search
grid.best_params_

In [None]:
#RMSE of the tuned model on the test set
pred=grid.predict(x_test)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred)))

Now let's try classification models. In order to do that, we will round the rating values to whole numbers so that they can be used as class labels.

In [None]:
#how often each rating value occurs before the rounding
df1['rating'].value_counts()

In [None]:
#Round every rating to the nearest whole star (x.5 goes up) and keep the classes within 3..9:
#3.0-3.4 -> 3, 3.5-4.4 -> 4, ..., 7.5-8.4 -> 8, 8.5 and above -> 9.
#The former if/elif chain tested `< 7.4` instead of `<= 7.4`, so a rating of exactly 7.4
#fell through to class 9, and it assigned through chained indexing (df1['rating'][i]=...),
#which pandas may apply to a temporary copy.
df1['rating']=np.floor(df1['rating']+0.5).clip(3,9).astype(int)

In [None]:
#class sizes after the rounding
df1['rating'].value_counts()

In [None]:
#features: every column except the target; the target is now the rounded rating class
x=df1.drop('rating',axis=1)
y=df1['rating']

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
#a fixed random_state makes the split, and so every report below, reproducible between runs
x_train,x_test,y_train,y_test=train_test_split(x,y, test_size=0.3, random_state=42)

## model 1- KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
#share of misclassified test samples for k = 1..50 neighbours
error=[]
for n_neighbors in range(1,51):
    model=KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(x_train,y_train)
    pred=model.predict(x_test)
    misclassified=np.array(pred)!=np.array(y_test)
    error.append(np.mean(misclassified))


In [None]:
#error rate per tried k; position 0 on the x axis corresponds to k=1
plt.plot(error)

In [None]:
#k-nearest-neighbours classifier with k=49
#NOTE(review): 49 is presumably read off the error plot above - re-check it when the data or the split changes
model=KNeighborsClassifier(n_neighbors=49)
model.fit(x_train,y_train)
pred=model.predict(x_test)

print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

## model 2- SVC

In [None]:
#support vector classifier with default parameters
from sklearn.svm import SVC
model=SVC()
model.fit(x_train,y_train)
pred=model.predict(x_test)

print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

## model 3- SGDC

In [None]:
#linear classifier trained with stochastic gradient descent, default parameters
from sklearn.linear_model import SGDClassifier
model=SGDClassifier()
model.fit(x_train,y_train)
pred=model.predict(x_test)

print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

## model 4- GaussianNB

In [None]:
#Gaussian naive Bayes with default parameters
from sklearn.naive_bayes import GaussianNB
model=GaussianNB()
model.fit(x_train,y_train)
pred=model.predict(x_test)

print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

## model 5- Multinomial NB

In [None]:
#multinomial naive Bayes with default parameters
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(x_train,y_train)
pred=model.predict(x_test)

print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

 ## model 6- Complement NB

In [None]:
#complement naive Bayes with default parameters
from sklearn.naive_bayes import ComplementNB
model=ComplementNB()
model.fit(x_train,y_train)
pred=model.predict(x_test)

print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

## model 7- Bernoulli NB

In [None]:
#Bernoulli naive Bayes with default parameters.
#The cell used to fit ComplementNB again, so its report duplicated the previous model.
from sklearn.naive_bayes import BernoulliNB
model=BernoulliNB()
model.fit(x_train,y_train)
pred=model.predict(x_test)

print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

## model 7- Random Forest

In [None]:
#random forest classifier with default parameters
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier()
model.fit(x_train,y_train)
pred=model.predict(x_test)

print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

## model 8- Logistic Regression

In [None]:
#logistic regression with default parameters
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(x_train,y_train)
pred=model.predict(x_test)

print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

## model 9- Decision Tree

In [None]:
#decision tree classifier with default parameters
from sklearn.tree import DecisionTreeClassifier
model=DecisionTreeClassifier()
model.fit(x_train,y_train)
pred=model.predict(x_test)

print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

## model 10- Gaussian Process

In [None]:
#Gaussian process classifier with default parameters
from sklearn.gaussian_process import GaussianProcessClassifier
model=GaussianProcessClassifier()
model.fit(x_train,y_train)
pred=model.predict(x_test)

print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

It's quite difficult to decide which model is the best. Logistic regression has the highest average f1-score, but some of the other models are better at predicting particular ratings; for example, for rating 4 the highest f1-score, 0.25, is achieved by the GaussianNB model.