In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


<h2>Series</h2>

In [2]:
ser1 = pd.Series(data=[1,2,3,4,5,6],index=[0,1,2,3,4,'xx'])
ser1

0     1
1     2
2     3
3     4
4     5
xx    6
dtype: int64

In [3]:
ser1[0], ser1['xx']

(1, 6)

In [4]:
ser1.loc[[0, 'xx']],  ser1.loc[0]

(0     1
 xx    6
 dtype: int64,
 1)

In [5]:
ser1.iloc[0:-1], ser1.iloc[0]

(0    1
 1    2
 2    3
 3    4
 4    5
 dtype: int64,
 1)

In [6]:
2 in ser1 ## checks for inedx and not value 

True

In [7]:
6 in ser1 ## checks for inedx and not value 

False

<h2>Dataframe</h2>

In [8]:
dict1 = {
    'one': pd.Series([10, 30, 40, 50, 60], ['xx', 'yy', 'zz', 'll', 'mm']),
    'two': pd.Series([70 , 80, 90, 100, 110], ['aa', 'bb', 'cc', 'xx', 'yy'])
}
df1 = pd.DataFrame(dict1)
df1

Unnamed: 0,one,two
aa,,70.0
bb,,80.0
cc,,90.0
ll,50.0,
mm,60.0,
xx,10.0,100.0
yy,30.0,110.0
zz,40.0,


In [9]:
df1.index, df1.columns

(Index(['aa', 'bb', 'cc', 'll', 'mm', 'xx', 'yy', 'zz'], dtype='object'),
 Index(['one', 'two'], dtype='object'))

In [10]:
df1['two']['aa']

70.0

In [11]:
df1['two']['aa'] = 67

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df1['two']['aa'] = 67


In [12]:
df1['three'] = df1['two']*df1['one']
df1

Unnamed: 0,one,two,three
aa,,67.0,
bb,,80.0,
cc,,90.0,
ll,50.0,,
mm,60.0,,
xx,10.0,100.0,1000.0
yy,30.0,110.0,3300.0
zz,40.0,,


In [13]:
df1['flag'] = df1['two']>=65
df1

Unnamed: 0,one,two,three,flag
aa,,67.0,,True
bb,,80.0,,True
cc,,90.0,,True
ll,50.0,,,False
mm,60.0,,,False
xx,10.0,100.0,1000.0,True
yy,30.0,110.0,3300.0,True
zz,40.0,,,False


In [14]:
df1.pop('flag')

aa     True
bb     True
cc     True
ll    False
mm    False
xx     True
yy     True
zz    False
Name: flag, dtype: bool

In [16]:
df1

Unnamed: 0,one,two,three
aa,,67.0,
bb,,80.0,
cc,,90.0,
ll,50.0,,
mm,60.0,,
xx,10.0,100.0,1000.0
yy,30.0,110.0,3300.0
zz,40.0,,


In [17]:
del df1['three']


In [18]:
df1

Unnamed: 0,one,two
aa,,67.0
bb,,80.0
cc,,90.0
ll,50.0,
mm,60.0,
xx,10.0,100.0
yy,30.0,110.0
zz,40.0,


In [45]:
fp1 = 'dataset/ml-latest-small'
import os
os.chdir(fp1)

In [49]:
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [50]:
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [53]:
tags = pd.read_csv('tags.csv')
tags.tail()

Unnamed: 0,userId,movieId,tag,timestamp
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978
3682,610,168248,Heroic Bloodshed,1493844270


In [54]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [55]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [57]:
ratings['rating'].describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [58]:
ratings['rating'].min()

0.5

In [59]:
ratings['rating'].std()

1.042529239060635

In [61]:
movies.shape

(9742, 3)

In [64]:
ratings.isnull().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [66]:
ratings = ratings.dropna()
ratings.isnull().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [68]:
movies[['title', 'genres']]

Unnamed: 0,title,genres
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji (1995),Adventure|Children|Fantasy
2,Grumpier Old Men (1995),Comedy|Romance
3,Waiting to Exhale (1995),Comedy|Drama|Romance
4,Father of the Bride Part II (1995),Comedy
...,...,...
9737,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,Flint (2017),Drama
9740,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [72]:
is_comedy = movies['genres'].str.contains('Comedy')

In [73]:
movies[is_comedy]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
6,7,Sabrina (1995),Comedy|Romance
...,...,...,...
9732,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi
9734,193571,Silver Spoon (2014),Comedy|Drama
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy


In [78]:
t = movies.merge(tags, on='movieId', how='inner')
t.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932
