In [1]:
import numpy as np

arr_32 = np.array([1.5] * 1000000, dtype = np.float32)

In [2]:
arr_32

array([1.5, 1.5, 1.5, ..., 1.5, 1.5, 1.5], dtype=float32)

In [3]:
print(arr_32.nbytes, "bytes")

4000000 bytes


In [4]:
arr1 = np.array([[1., 2., 3.], [4., 1., 6.]])
arr2 = np.array([[0.5, 1., 1.], [2., 2., -1.]])

In [5]:
print(arr1 * arr2)

[[ 0.5  2.   3. ]
 [ 8.   2.  -6. ]]


In [6]:
from timeit import default_timer as timer
size = 1000000  # one million elements
a_list = [i for i in range(size)]
b_list = [i for i in range(size, 0, -1)]
a_array = np.array(a_list)
b_array = np.array(b_list)

In [7]:
start = timer()
result_loop = []
for i in range(len(a_list)):
    result_loop.append(a_list[i] + b_list[i])
end = timer()
loop_time = end - start
print(f"Traditional for loop: {loop_time:.3f} seconds")

Traditional for loop: 0.107 seconds


In [8]:
start = timer()
result_list_comp = [a_list[i] + b_list[i] for i in range(len(a_list))]
end = timer()
list_comp_time = end - start
print(f"List comprehension: {list_comp_time:.3f} seconds")

List comprehension: 0.062 seconds


In [9]:
start = timer()
result_numpy = a_array + b_array
end = timer()
numpy_time = end - start
print(f"NumPy vectorized addition: {numpy_time:.3f} seconds")

NumPy vectorized addition: 0.005 seconds


In [10]:
print(arr4)
arr4[0]
arr4[-2]
arr4[1:3]

NameError: name 'arr4' is not defined

In [11]:
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(arr2d)

[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [12]:
print(arr2d)
print(arr2d[2][-1])
print(arr2d[2, -1])

[[1 2 3]
 [4 5 6]
 [7 8 9]]
9
9


In [13]:
arr2d[2]

array([7, 8, 9])

In [14]:
arr2d[2, :]

array([7, 8, 9])

In [15]:
arr2d[2:, :]

array([[7, 8, 9]])

In [16]:
import pandas as pd

height = pd.Series([163, 156, 177])
print(height)

0    163
1    156
2    177
dtype: int64


In [17]:
height2 = pd.Series([163, 156, 177],
                    index = ['Lee', 'Kim', 'Jung'])
print(height2)

Lee     163
Kim     156
Jung    177
dtype: int64


In [18]:
height2 = pd.Series([163, 156, 177],
                    index = ['Lee', 'Kim', 'Kim'])
print(height2)

Lee    163
Kim    156
Kim    177
dtype: int64


In [19]:
height2.loc['Kim']

Kim    156
Kim    177
dtype: int64

In [21]:
height2.index = ['Choi', 'Han', 'Han']
print(height2)

Choi    163
Han     156
Han     177
dtype: int64


In [22]:
height2.loc['Han']

Han    156
Han    177
dtype: int64

In [23]:
height2.name = 'name_height'
print(height2)

Choi    163
Han     156
Han     177
Name: name_height, dtype: int64


In [24]:
height2.index.name = 'name'
print(height2)

name
Choi    163
Han     156
Han     177
Name: name_height, dtype: int64


In [25]:
print(height[0:2])
print(height[[0, 2]])

0    163
1    156
dtype: int64
0    163
2    177
dtype: int64


In [26]:
print(height2)
height2['Choi':'Park'] # inclusive with string indices
height2[:2] # it also works with positions

name
Choi    163
Han     156
Han     177
Name: name_height, dtype: int64


name
Choi    163
Han     156
Name: name_height, dtype: int64

In [27]:
obj = pd.Series([1, 2, 3], index = [2, 0, 1])
print(obj)
print(obj[[0, 1, 2]])

2    1
0    2
1    3
dtype: int64
0    2
1    3
2    1
dtype: int64


In [28]:
obj.sort_index()

0    2
1    3
2    1
dtype: int64

In [35]:
height[height > 160]
feet = height * 0.0328084
np.exp(pd.Series([1, 2, 3]))

0    163
2    177
dtype: int64

In [34]:
grades = pd.Series([np.nan, 2, 'b', 'a+',  'a', 'f'])
grades.head(3) # similarly tail()
grades.value_counts(dropna = False)

NaN    1
2      1
b      1
a+     1
a      1
f      1
dtype: int64

In [36]:
grades.head(3) # similarly tail()

0    NaN
1      2
2      b
dtype: object

In [37]:
grades.isin(['a+', 'a', 'b+', 'b'])
grades.isnull() # the opposite is notnull()

0     True
1    False
2    False
3    False
4    False
5    False
dtype: bool

In [47]:
grades[grades.isin(['a+', 'a', 'b+', 'b'])]

2     b
3    a+
4     a
dtype: object

In [39]:
grades.dropna()
grades.fillna('f')

0     f
1     2
2     b
3    a+
4     a
5     f
dtype: object

In [48]:
grades

0    NaN
1      2
2      b
3     a+
4      a
5      f
dtype: object

In [42]:
grades.str.contains('a|a+')
# grades.str.replace('a+', 'a')

0      NaN
1      NaN
2    False
3     True
4     True
5    False
dtype: object

In [46]:
grades.str.replace('a+', 'a')

  grades.str.replace('a+', 'a')


0    NaN
1    NaN
2      b
3     a+
4      a
5      f
dtype: object

In [50]:
type(grades)

pandas.core.series.Series

In [51]:
grades.dtypes

dtype('O')

In [52]:
df = pd.DataFrame(
  {'age': [23, 34, 23, 45, 67, 26],
  'income': [30000, 56000, None, 112000, 179000, 78000],
  'gender': ['F', None, 'M', 'F', 'F', 'M'],
  'married': [True, False, False, False, True, True]},
  index = ['Jessica', 'Jisoo', 'Peter', 'Susan', 'Rui', 'Alex'])
print(df)

         age    income gender  married
Jessica   23   30000.0      F     True
Jisoo     34   56000.0   None    False
Peter     23       NaN      M    False
Susan     45  112000.0      F    False
Rui       67  179000.0      F     True
Alex      26   78000.0      M     True


In [53]:
print(df.loc['Alex'])
print(df.loc[['Jisoo', 'Alex']])

age             26
income     78000.0
gender           M
married       True
Name: Alex, dtype: object
       age   income gender  married
Jisoo   34  56000.0   None    False
Alex    26  78000.0      M     True


In [54]:
type(df.loc['Alex'])

pandas.core.series.Series

In [55]:

print(df.loc['Jisoo':'Alex'])

       age    income gender  married
Jisoo   34   56000.0   None    False
Peter   23       NaN      M    False
Susan   45  112000.0      F    False
Rui     67  179000.0      F     True
Alex    26   78000.0      M     True


In [57]:
df.iloc[0]

age             23
income     30000.0
gender           F
married       True
Name: Jessica, dtype: object

In [59]:
df.index = [x for x in range(0,6)]

In [60]:
df.loc[0]

age             23
income     30000.0
gender           F
married       True
Name: 0, dtype: object

In [61]:
print(df.iloc[0:3, :])

   age   income gender  married
0   23  30000.0      F     True
1   34  56000.0   None    False
2   23      NaN      M    False


In [62]:
df.index = range(1001, 1007) # let see this as survey
df = df.reindex(range(1001, 1007))
print(df)

      age    income gender  married
1001   23   30000.0      F     True
1002   34   56000.0   None    False
1003   23       NaN      M    False
1004   45  112000.0      F    False
1005   67  179000.0      F     True
1006   26   78000.0      M     True


In [63]:
print(df.iloc[0:3, :])

      age   income gender  married
1001   23  30000.0      F     True
1002   34  56000.0   None    False
1003   23      NaN      M    False


In [64]:
print(df.iloc[0:3, :]) # non-inclusive

      age   income gender  married
1001   23  30000.0      F     True
1002   34  56000.0   None    False
1003   23      NaN      M    False


In [65]:
print(df[['age', 'income']].iloc[:3])

      age   income
1001   23  30000.0
1002   34  56000.0
1003   23      NaN


In [66]:
print(df.loc[1001:1002])

      age   income gender  married
1001   23  30000.0      F     True
1002   34  56000.0   None    False


In [68]:
# print(df.iloc[1001]) 에러 ....위치니까 에러다

In [70]:
obj = pd.Series(np.arange(4),
                index=['d', 'a', 'b', 'c'])
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                    index=['y', 'x'],
                    columns = ['d', 'a', 'b', 'c'])
print(obj)
print(frame)

d    0
a    1
b    2
c    3
dtype: int64
   d  a  b  c
y  0  1  2  3
x  4  5  6  7


In [71]:
print(obj.sort_index())

a    1
b    2
c    3
d    0
dtype: int64


In [72]:
print(frame.sort_index())

   d  a  b  c
x  4  5  6  7
y  0  1  2  3


In [73]:
print(frame.sort_index(axis = 'columns',
                       ascending = False))

   d  c  b  a
y  0  3  2  1
x  4  7  6  5


In [75]:

frame = pd.DataFrame({'b': [4, 7, -3, 2],
                      'a': [0, 1, 0, 1]})
print(frame)

   b  a
0  4  0
1  7  1
2 -3  0
3  2  1


In [76]:
print(frame.sort_values(['a'], ascending = False))

   b  a
1  7  1
3  2  1
0  4  0
2 -3  0


In [77]:
print(frame.sort_values(['a', 'b'])) 

   b  a
2 -3  0
0  4  0
3  2  1
1  7  1


In [78]:
ser = pd.Series([4.3, 3.3, np.nan, None, 0])
print(ser.isna())

0    False
1    False
2     True
3     True
4    False
dtype: bool


In [79]:
print(ser.isnull())

0    False
1    False
2     True
3     True
4    False
dtype: bool


In [80]:
ser = pd.Series([4.3, 3.3, np.nan, None, 0])
print(ser.dropna()) # re-assignment necessary

0    4.3
1    3.3
4    0.0
dtype: float64


In [82]:
data = pd.DataFrame([[1., 6.5, 3.],
                    [1., np.nan, np.nan],
                    [np.nan, np.nan, np.nan],
                    [np.nan, 6.5, 3.]])

print(data)

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


In [83]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'] + ['one'],
                     'k2': [1, 1, 2, 3, 3, 4, 4, 3]})
print(data)

    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4
7  one   3


In [84]:
print(data.duplicated())

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
dtype: bool


In [85]:
print(data.drop_duplicates())

    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4


In [86]:
data['k3'] = range(8)
print(data)

    k1  k2  k3
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
5  two   4   5
6  two   4   6
7  one   3   7


In [None]:
print(data.drop_duplicates(subset = 'k1', keep = 'first'))
