In [1]:
import numpy as np

## Red wines dataset

In [4]:
# Read the red-wine CSV into a 2-D float array, skipping the header row.
wines = np.genfromtxt(
    "./winequality-red.csv",
    delimiter=",",
    skip_header=1,
)
wines

array([[ 7.4  ,  0.7  ,  0.   , ...,  0.56 ,  9.4  ,  5.   ],
       [ 7.8  ,  0.88 ,  0.   , ...,  0.68 ,  9.8  ,  5.   ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  0.65 ,  9.8  ,  5.   ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  0.75 , 11.   ,  6.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  0.71 , 10.2  ,  5.   ],
       [ 6.   ,  0.31 ,  0.47 , ...,  0.66 , 11.   ,  6.   ]])

In [6]:
# An integer index combined with an empty slice (all rows) picks one column
# and returns it as a flat 1-D array.
print(wines[:, 0])
# Slicing the column axis instead keeps the result 2-D: one value per row,
# laid out as a single-column array.
print(wines[:, :1])
# Indexing with a one-element list yields the same 2-D, single-column result.
print(wines[:, [0]])

[7.4 7.8 7.8 ... 6.3 5.9 6. ]
[[7.4]
 [7.8]
 [7.8]
 ...
 [6.3]
 [5.9]
 [6. ]]
[[7.4]
 [7.8]
 [7.8]
 ...
 [6.3]
 [5.9]
 [6. ]]


In [7]:
# A slice on the column axis selects a contiguous run of columns
# (here the first three), keeping every row.
wines[:, :3]

array([[7.4  , 0.7  , 0.   ],
       [7.8  , 0.88 , 0.   ],
       [7.8  , 0.76 , 0.04 ],
       ...,
       [6.3  , 0.51 , 0.13 ],
       [5.9  , 0.645, 0.12 ],
       [6.   , 0.31 , 0.47 ]])

In [9]:
# A list of column indices selects columns that are not next to each other.
selected_columns = [0, 2, 4]
wines[:, selected_columns]

array([[7.4  , 0.   , 0.076],
       [7.8  , 0.   , 0.098],
       [7.8  , 0.04 , 0.092],
       ...,
       [6.3  , 0.13 , 0.076],
       [5.9  , 0.12 , 0.075],
       [6.   , 0.47 , 0.067]])

In [11]:
# Basic summary statistics work directly on a selected column.
# Quality is stored in the last column, so index -1 selects it;
# .mean() then gives the average quality over all wines.
quality = wines[:, -1]
quality.mean()

5.6360225140712945

## Graduate school admissions

In [24]:
# Load the admissions CSV as a structured array: one record per data row,
# with the header row skipped and each column's type inferred (dtype=None).
#
# The field names are written in the sanitized form that genfromtxt produces
# anyway (spaces become underscores, "." is dropped), so the names declared
# here are exactly the keys used for lookups such as gsa["Chance_of_Admit"]
# and gsa["GRE_Score"].
gsa = np.genfromtxt(
    "Admission_Predict.csv",
    delimiter=",",
    skip_header=1,
    dtype=None,
    names=(
        "Serial_No",
        "GRE_Score",
        "TOEFL_Score",
        "University_Rating",
        "SOP",
        "LOR",
        "CGPA",
        "Research",
        "Chance_of_Admit",
    ),
)
# Show the number of records instead of echoing the whole array.
gsa.shape

(400,)

In [17]:
# Because the array was loaded with column names, a column can be pulled out
# by its name; show just the first five values.
gsa["CGPA"][:5]

array([9.65, 8.87, 8.  , 8.67, 8.21])

In [18]:
# Rescale CGPA: divide by 10, then multiply by 4
# (presumably converting a 10-point scale to a 4-point one).
# NOTE: this overwrites the very column it reads from, so running the cell a
# second time rescales the already-rescaled values.
cgpa_fraction = gsa["CGPA"] / 10
gsa["CGPA"] = cgpa_fraction * 4
gsa["CGPA"][:5]

array([3.86 , 3.548, 3.2  , 3.468, 3.284])

In [19]:
# We can find out how many students have research experience by building a
# boolean mask (True where Research == 1) and counting its True entries.
has_research = gsa["Research"] == 1
np.count_nonzero(has_research)

219

In [20]:
# Do students with a higher chance of admission also have a higher average
# GRE score? Compare the mean GRE score of the two ends of the range.
admit_chance = gsa["Chance_of_Admit"]
likely_admits = gsa[admit_chance > 0.8]    # chance of admit above 0.8
unlikely_admits = gsa[admit_chance < 0.4]  # chance of admit below 0.4
print(likely_admits["GRE_Score"].mean())
print(unlikely_admits["GRE_Score"].mean())

328.7350427350427
302.2857142857143


In [23]:
# Same comparison for CGPA: mean CGPA of students whose chance of admit is
# above 0.8 versus those whose chance of admit is below 0.4.
admit_chance = gsa["Chance_of_Admit"]
likely_admits = gsa[admit_chance > 0.8]
unlikely_admits = gsa[admit_chance < 0.4]
print(likely_admits["CGPA"].mean())
print(unlikely_admits["CGPA"].mean())

3.7106666666666666
3.0222857142857142
